diff mbox series

[v9,4/4] xfs: replace mrlock_t with rw_semaphores

Message ID 20201006191541.115364-5-preichl@redhat.com (mailing list archive)
State Superseded
Headers show
Series xfs: Remove wrappers for some semaphores | expand

Commit Message

Pavel Reichl Oct. 6, 2020, 7:15 p.m. UTC
Remove mrlock_t as it does not provide any extra value over
rw_semaphores. Make i_lock and i_mmaplock native rw_semaphores and
replace mr*() functions with native rwsem calls.

Release the lock in xfs_btree_split() just before the work-queue
executing xfs_btree_split_worker() is scheduled and make
xfs_btree_split_worker() acquire the lock as the first thing and
release it just before returning from the function. This is done so the
ownership of the lock is transferred between kernel threads and thus
lockdep won't complain about the lock being held by a different kernel
thread.

Signed-off-by: Pavel Reichl <preichl@redhat.com>
---
 fs/xfs/libxfs/xfs_btree.c | 14 +++++++
 fs/xfs/mrlock.h           | 78 ---------------------------------------
 fs/xfs/xfs_inode.c        | 36 ++++++++++--------
 fs/xfs/xfs_inode.h        |  4 +-
 fs/xfs/xfs_iops.c         |  4 +-
 fs/xfs/xfs_linux.h        |  2 +-
 fs/xfs/xfs_super.c        |  6 +--
 7 files changed, 41 insertions(+), 103 deletions(-)
 delete mode 100644 fs/xfs/mrlock.h

Comments

Darrick J. Wong Oct. 7, 2020, 1:21 a.m. UTC | #1
On Tue, Oct 06, 2020 at 09:15:41PM +0200, Pavel Reichl wrote:
> Remove mrlock_t as it does not provide any extra value over
> rw_semaphores. Make i_lock and i_mmaplock native rw_semaphores and
> replace mr*() functions with native rwsem calls.
> 
> Release the lock in xfs_btree_split() just before the work-queue
> executing xfs_btree_split_worker() is scheduled and make
> xfs_btree_split_worker() to acquire the lock as a first thing and
> release it just before returning from the function. This it done so the
> ownership of the lock is transfered between kernel threads and thus
> lockdep won't complain about lock being held by a different kernel
> thread.
> 
> Signed-off-by: Pavel Reichl <preichl@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_btree.c | 14 +++++++
>  fs/xfs/mrlock.h           | 78 ---------------------------------------
>  fs/xfs/xfs_inode.c        | 36 ++++++++++--------
>  fs/xfs/xfs_inode.h        |  4 +-
>  fs/xfs/xfs_iops.c         |  4 +-
>  fs/xfs/xfs_linux.h        |  2 +-
>  fs/xfs/xfs_super.c        |  6 +--
>  7 files changed, 41 insertions(+), 103 deletions(-)
>  delete mode 100644 fs/xfs/mrlock.h
> 
> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> index 2d25bab68764..1d1bb8423688 100644
> --- a/fs/xfs/libxfs/xfs_btree.c
> +++ b/fs/xfs/libxfs/xfs_btree.c
> @@ -2816,6 +2816,7 @@ xfs_btree_split_worker(
>  	unsigned long		pflags;
>  	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
>  
> +	rwsem_acquire(&args->cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);

These calls also need a comment explaining just what they're doing.

>  	/*
>  	 * we are in a transaction context here, but may also be doing work
>  	 * in kswapd context, and hence we may need to inherit that state
> @@ -2832,6 +2833,7 @@ xfs_btree_split_worker(
>  	complete(args->done);
>  
>  	current_restore_flags_nested(&pflags, new_pflags);
> +	rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);

Note that as soon as you call complete(), xfs_btree_split can wake up
and return, which means that *args could now point to reclaimed stack
space.  This leads to crashes and memory corruption in generic/562 on
a 1k block filesystem (though in principle this can happen anywhere):

[  227.611722] =====================================
[  227.612673] WARNING: bad unlock balance detected!
[  227.613539] 5.9.0-rc4-djw #rc4 Not tainted
[  227.614290] -------------------------------------
[  227.615141] kworker/1:25/12941 is trying to release lock (
[  227.615154] general protection fault, probably for non-canonical address 0x485fc44ba1158c55: 0000 [#1] PREEMPT SMP
[  227.617903] CPU: 1 PID: 12941 Comm: kworker/1:25 Not tainted 5.9.0-rc4-djw #rc4
[  227.619171] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1 04/01/2014
[  227.620731] Workqueue: xfsalloc xfs_btree_split_worker [xfs]
[  227.621749] RIP: 0010:print_unlock_imbalance_bug.cold+0x4e/0xb4
[  227.622800] Code: e8 d4 fb ff ff 48 c7 c7 78 7a e1 81 e8 0a d0 00 00 8b 95 d0 04 00 00 48 8d b5 e0 06 00 00 48 c7 c7 a8 7a e1 81 e8 f1 cf 00 00 <48> 8b 73 18 48 8b 3b e8 ba fd ff ff 48 c7 c7 6b 74 e1 81 e8 d9
 cf
[  227.625977] RSP: 0018:ffffc90001927dd0 EFLAGS: 00010046
[  227.626915] RAX: 000000000000002e RBX: 485fc44ba1158c55 RCX: 0000000000000000
[  227.628177] RDX: 0000000000000000 RSI: ffffffff810e7d5f RDI: 00000000ffffffff
[  227.629434] RBP: ffff8880304ac000 R08: 00000034feeb6ecf R09: 0000000000000001
[  227.630678] R10: 0000000000000046 R11: ffffffff83204b74 R12: ffffffffa037ff3b
[  227.631922] R13: ffffffffa037ff3b R14: 0000000000000246 R15: 0000000000000003
[  227.633181] FS:  0000000000000000(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
[  227.634595] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  227.635615] CR2: 00007ff762201d90 CR3: 0000000077756001 CR4: 00000000001706a0
[  227.636869] Call Trace:
[  227.637360]  lock_release+0x169/0x3f0
[  227.638043]  process_one_work+0x23b/0x5a0
[  227.638782]  worker_thread+0x54/0x3a0
[  227.639468]  ? process_one_work+0x5a0/0x5a0
[  227.640193]  kthread+0x13c/0x180
[  227.640754]  ? kthread_park+0x90/0x90
[  227.641392]  ret_from_fork+0x1f/0x30
[  227.642005] Modules linked in: btrfs blake2b_generic xor zstd_compress lzo_compress lzo_decompress zlib_deflate raid6_pq dm_flakey xfs libcrc32c ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_REDIRECT ip_set_hash_ip ip_set_hash_net xt_tcpudp xt_set iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set_hash_mac ip_set nfnetlink ip6table_filter ip6_tables iptable_filter bfq sch_fq_codel ip_tables x_tables overlay nfsv4 af_packet [last unloaded: scsi_debug]
[  227.648071] Dumping ftrace buffer:
[  227.648590]    (ftrace buffer empty)
[  227.649135] ---[ end trace 91c58b635eaa3d46 ]---
[  227.649792] RIP: 0010:print_unlock_imbalance_bug.cold+0x4e/0xb4

Also, reverting just this patch leads to compilation errors.

--D

>  }
>  
>  /*
> @@ -2863,8 +2865,20 @@ xfs_btree_split(
>  	args.done = &done;
>  	args.kswapd = current_is_kswapd();
>  	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
> +	/*
> +	 * Update lockdep's ownership information to reflect that we
> +	 * will be transferring the ilock from this thread to the
> +	 * worker.
> +	 */
> +	rwsem_release(&cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
>  	queue_work(xfs_alloc_wq, &args.work);
>  	wait_for_completion(&done);
> +	/*
> +	 * Update lockdep's lock ownership information to point to
> +	 * this thread as the lock owner now that the worker item is
> +	 * done.
> +	 */
> +	rwsem_acquire(&cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
>  	destroy_work_on_stack(&args.work);
>  	return args.result;
>  }
> diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
> deleted file mode 100644
> index 79155eec341b..000000000000
> --- a/fs/xfs/mrlock.h
> +++ /dev/null
> @@ -1,78 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0
> -/*
> - * Copyright (c) 2000-2006 Silicon Graphics, Inc.
> - * All Rights Reserved.
> - */
> -#ifndef __XFS_SUPPORT_MRLOCK_H__
> -#define __XFS_SUPPORT_MRLOCK_H__
> -
> -#include <linux/rwsem.h>
> -
> -typedef struct {
> -	struct rw_semaphore	mr_lock;
> -#if defined(DEBUG) || defined(XFS_WARN)
> -	int			mr_writer;
> -#endif
> -} mrlock_t;
> -
> -#if defined(DEBUG) || defined(XFS_WARN)
> -#define mrinit(mrp, name)	\
> -	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
> -#else
> -#define mrinit(mrp, name)	\
> -	do { init_rwsem(&(mrp)->mr_lock); } while (0)
> -#endif
> -
> -#define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
> -#define mrfree(mrp)		do { } while (0)
> -
> -static inline void mraccess_nested(mrlock_t *mrp, int subclass)
> -{
> -	down_read_nested(&mrp->mr_lock, subclass);
> -}
> -
> -static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
> -{
> -	down_write_nested(&mrp->mr_lock, subclass);
> -#if defined(DEBUG) || defined(XFS_WARN)
> -	mrp->mr_writer = 1;
> -#endif
> -}
> -
> -static inline int mrtryaccess(mrlock_t *mrp)
> -{
> -	return down_read_trylock(&mrp->mr_lock);
> -}
> -
> -static inline int mrtryupdate(mrlock_t *mrp)
> -{
> -	if (!down_write_trylock(&mrp->mr_lock))
> -		return 0;
> -#if defined(DEBUG) || defined(XFS_WARN)
> -	mrp->mr_writer = 1;
> -#endif
> -	return 1;
> -}
> -
> -static inline void mrunlock_excl(mrlock_t *mrp)
> -{
> -#if defined(DEBUG) || defined(XFS_WARN)
> -	mrp->mr_writer = 0;
> -#endif
> -	up_write(&mrp->mr_lock);
> -}
> -
> -static inline void mrunlock_shared(mrlock_t *mrp)
> -{
> -	up_read(&mrp->mr_lock);
> -}
> -
> -static inline void mrdemote(mrlock_t *mrp)
> -{
> -#if defined(DEBUG) || defined(XFS_WARN)
> -	mrp->mr_writer = 0;
> -#endif
> -	downgrade_write(&mrp->mr_lock);
> -}
> -
> -#endif /* __XFS_SUPPORT_MRLOCK_H__ */
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 035925d406d5..213a4a947854 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -191,14 +191,15 @@ xfs_ilock(
>  	}
>  
>  	if (lock_flags & XFS_MMAPLOCK_EXCL)
> -		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
> +		down_write_nested(&ip->i_mmaplock,
> +				XFS_MMAPLOCK_DEP(lock_flags));
>  	else if (lock_flags & XFS_MMAPLOCK_SHARED)
> -		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
> +		down_read_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
>  
>  	if (lock_flags & XFS_ILOCK_EXCL)
> -		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
> +		down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
>  	else if (lock_flags & XFS_ILOCK_SHARED)
> -		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
> +		down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
>  }
>  
>  /*
> @@ -242,27 +243,27 @@ xfs_ilock_nowait(
>  	}
>  
>  	if (lock_flags & XFS_MMAPLOCK_EXCL) {
> -		if (!mrtryupdate(&ip->i_mmaplock))
> +		if (!down_write_trylock(&ip->i_mmaplock))
>  			goto out_undo_iolock;
>  	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
> -		if (!mrtryaccess(&ip->i_mmaplock))
> +		if (!down_read_trylock(&ip->i_mmaplock))
>  			goto out_undo_iolock;
>  	}
>  
>  	if (lock_flags & XFS_ILOCK_EXCL) {
> -		if (!mrtryupdate(&ip->i_lock))
> +		if (!down_write_trylock(&ip->i_lock))
>  			goto out_undo_mmaplock;
>  	} else if (lock_flags & XFS_ILOCK_SHARED) {
> -		if (!mrtryaccess(&ip->i_lock))
> +		if (!down_read_trylock(&ip->i_lock))
>  			goto out_undo_mmaplock;
>  	}
>  	return 1;
>  
>  out_undo_mmaplock:
>  	if (lock_flags & XFS_MMAPLOCK_EXCL)
> -		mrunlock_excl(&ip->i_mmaplock);
> +		up_write(&ip->i_mmaplock);
>  	else if (lock_flags & XFS_MMAPLOCK_SHARED)
> -		mrunlock_shared(&ip->i_mmaplock);
> +		up_read(&ip->i_mmaplock);
>  out_undo_iolock:
>  	if (lock_flags & XFS_IOLOCK_EXCL)
>  		up_write(&VFS_I(ip)->i_rwsem);
> @@ -309,14 +310,14 @@ xfs_iunlock(
>  		up_read(&VFS_I(ip)->i_rwsem);
>  
>  	if (lock_flags & XFS_MMAPLOCK_EXCL)
> -		mrunlock_excl(&ip->i_mmaplock);
> +		up_write(&ip->i_mmaplock);
>  	else if (lock_flags & XFS_MMAPLOCK_SHARED)
> -		mrunlock_shared(&ip->i_mmaplock);
> +		up_read(&ip->i_mmaplock);
>  
>  	if (lock_flags & XFS_ILOCK_EXCL)
> -		mrunlock_excl(&ip->i_lock);
> +		up_write(&ip->i_lock);
>  	else if (lock_flags & XFS_ILOCK_SHARED)
> -		mrunlock_shared(&ip->i_lock);
> +		up_read(&ip->i_lock);
>  
>  	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
>  }
> @@ -335,9 +336,9 @@ xfs_ilock_demote(
>  		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
>  
>  	if (lock_flags & XFS_ILOCK_EXCL)
> -		mrdemote(&ip->i_lock);
> +		downgrade_write(&ip->i_lock);
>  	if (lock_flags & XFS_MMAPLOCK_EXCL)
> -		mrdemote(&ip->i_mmaplock);
> +		downgrade_write(&ip->i_mmaplock);
>  	if (lock_flags & XFS_IOLOCK_EXCL)
>  		downgrade_write(&VFS_I(ip)->i_rwsem);
>  
> @@ -385,11 +386,14 @@ xfs_isilocked(
>  	uint			lock_flags)
>  {
>  	if (lock_flags & (XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)) {
> +		ASSERT(!(lock_flags & ~(XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)));
>  		return __xfs_rwsem_islocked(&ip->i_lock,
>  				(lock_flags >> XFS_ILOCK_FLAG_SHIFT));
>  	}
>  
>  	if (lock_flags & (XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)) {
> +		ASSERT(!(lock_flags &
> +			~(XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)));
>  		return __xfs_rwsem_islocked(&ip->i_mmaplock,
>  				(lock_flags >> XFS_MMAPLOCK_FLAG_SHIFT));
>  	}
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index 77d5655191ab..02c98ecfe4c5 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -39,8 +39,8 @@ typedef struct xfs_inode {
>  
>  	/* Transaction and locking information. */
>  	struct xfs_inode_log_item *i_itemp;	/* logging information */
> -	mrlock_t		i_lock;		/* inode lock */
> -	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
> +	struct rw_semaphore	i_lock;		/* inode lock */
> +	struct rw_semaphore	i_mmaplock;	/* inode mmap IO lock */
>  	atomic_t		i_pincount;	/* inode pin count */
>  
>  	/*
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 80a13c8561d8..66cca3e599c7 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -1336,9 +1336,9 @@ xfs_setup_inode(
>  		 */
>  		lockdep_set_class(&inode->i_rwsem,
>  				  &inode->i_sb->s_type->i_mutex_dir_key);
> -		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
> +		lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class);
>  	} else {
> -		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
> +		lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class);
>  	}
>  
>  	/*
> diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
> index ab737fed7b12..ba37217f86d2 100644
> --- a/fs/xfs/xfs_linux.h
> +++ b/fs/xfs/xfs_linux.h
> @@ -22,7 +22,6 @@ typedef __u32			xfs_nlink_t;
>  #include "xfs_types.h"
>  
>  #include "kmem.h"
> -#include "mrlock.h"
>  
>  #include <linux/semaphore.h>
>  #include <linux/mm.h>
> @@ -61,6 +60,7 @@ typedef __u32			xfs_nlink_t;
>  #include <linux/ratelimit.h>
>  #include <linux/rhashtable.h>
>  #include <linux/xattr.h>
> +#include <linux/rwsem.h>
>  
>  #include <asm/page.h>
>  #include <asm/div64.h>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 71ac6c1cdc36..00be9cfa29fa 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -708,10 +708,8 @@ xfs_fs_inode_init_once(
>  	atomic_set(&ip->i_pincount, 0);
>  	spin_lock_init(&ip->i_flags_lock);
>  
> -	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
> -		     "xfsino", ip->i_ino);
> -	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
> -		     "xfsino", ip->i_ino);
> +	init_rwsem(&ip->i_mmaplock);
> +	init_rwsem(&ip->i_lock);
>  }
>  
>  /*
> -- 
> 2.26.2
>
Eric Sandeen Oct. 7, 2020, 2:17 p.m. UTC | #2
On 10/6/20 8:21 PM, Darrick J. Wong wrote:
> On Tue, Oct 06, 2020 at 09:15:41PM +0200, Pavel Reichl wrote:
>> Remove mrlock_t as it does not provide any extra value over
>> rw_semaphores. Make i_lock and i_mmaplock native rw_semaphores and
>> replace mr*() functions with native rwsem calls.
>>
>> Release the lock in xfs_btree_split() just before the work-queue
>> executing xfs_btree_split_worker() is scheduled and make
>> xfs_btree_split_worker() to acquire the lock as a first thing and
>> release it just before returning from the function. This it done so the
>> ownership of the lock is transfered between kernel threads and thus
>> lockdep won't complain about lock being held by a different kernel
>> thread.
>>
>> Signed-off-by: Pavel Reichl <preichl@redhat.com>
>> ---
>>  fs/xfs/libxfs/xfs_btree.c | 14 +++++++
>>  fs/xfs/mrlock.h           | 78 ---------------------------------------
>>  fs/xfs/xfs_inode.c        | 36 ++++++++++--------
>>  fs/xfs/xfs_inode.h        |  4 +-
>>  fs/xfs/xfs_iops.c         |  4 +-
>>  fs/xfs/xfs_linux.h        |  2 +-
>>  fs/xfs/xfs_super.c        |  6 +--
>>  7 files changed, 41 insertions(+), 103 deletions(-)
>>  delete mode 100644 fs/xfs/mrlock.h
>>
>> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
>> index 2d25bab68764..1d1bb8423688 100644
>> --- a/fs/xfs/libxfs/xfs_btree.c
>> +++ b/fs/xfs/libxfs/xfs_btree.c
>> @@ -2816,6 +2816,7 @@ xfs_btree_split_worker(
>>  	unsigned long		pflags;
>>  	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
>>  
>> +	rwsem_acquire(&args->cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
> These calls also need a comment explaining just what they're doing.
> 
>>  	/*
>>  	 * we are in a transaction context here, but may also be doing work
>>  	 * in kswapd context, and hence we may need to inherit that state
>> @@ -2832,6 +2833,7 @@ xfs_btree_split_worker(
>>  	complete(args->done);
>>  
>>  	current_restore_flags_nested(&pflags, new_pflags);
>> +	rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
> Note that as soon as you call complete(), xfs_btree_split can wake up
> and return, which means that *args could now point to reclaimed stack
> space.  This leads to crashes and memory corruption in generic/562 on
> a 1k block filesystem (though in principle this can happen anywhere):


What's the right way out of this; store *ip when we enter the function
and use that to get to the map, rather than args i guess?

Thanks,
-Eric
Darrick J. Wong Oct. 7, 2020, 3:25 p.m. UTC | #3
On Wed, Oct 07, 2020 at 09:17:13AM -0500, Eric Sandeen wrote:
> On 10/6/20 8:21 PM, Darrick J. Wong wrote:
> > On Tue, Oct 06, 2020 at 09:15:41PM +0200, Pavel Reichl wrote:
> >> Remove mrlock_t as it does not provide any extra value over
> >> rw_semaphores. Make i_lock and i_mmaplock native rw_semaphores and
> >> replace mr*() functions with native rwsem calls.
> >>
> >> Release the lock in xfs_btree_split() just before the work-queue
> >> executing xfs_btree_split_worker() is scheduled and make
> >> xfs_btree_split_worker() to acquire the lock as a first thing and
> >> release it just before returning from the function. This it done so the
> >> ownership of the lock is transfered between kernel threads and thus
> >> lockdep won't complain about lock being held by a different kernel
> >> thread.
> >>
> >> Signed-off-by: Pavel Reichl <preichl@redhat.com>
> >> ---
> >>  fs/xfs/libxfs/xfs_btree.c | 14 +++++++
> >>  fs/xfs/mrlock.h           | 78 ---------------------------------------
> >>  fs/xfs/xfs_inode.c        | 36 ++++++++++--------
> >>  fs/xfs/xfs_inode.h        |  4 +-
> >>  fs/xfs/xfs_iops.c         |  4 +-
> >>  fs/xfs/xfs_linux.h        |  2 +-
> >>  fs/xfs/xfs_super.c        |  6 +--
> >>  7 files changed, 41 insertions(+), 103 deletions(-)
> >>  delete mode 100644 fs/xfs/mrlock.h
> >>
> >> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> >> index 2d25bab68764..1d1bb8423688 100644
> >> --- a/fs/xfs/libxfs/xfs_btree.c
> >> +++ b/fs/xfs/libxfs/xfs_btree.c
> >> @@ -2816,6 +2816,7 @@ xfs_btree_split_worker(
> >>  	unsigned long		pflags;
> >>  	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
> >>  
> >> +	rwsem_acquire(&args->cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
> > These calls also need a comment explaining just what they're doing.
> > 
> >>  	/*
> >>  	 * we are in a transaction context here, but may also be doing work
> >>  	 * in kswapd context, and hence we may need to inherit that state
> >> @@ -2832,6 +2833,7 @@ xfs_btree_split_worker(
> >>  	complete(args->done);
> >>  
> >>  	current_restore_flags_nested(&pflags, new_pflags);
> >> +	rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
> > Note that as soon as you call complete(), xfs_btree_split can wake up
> > and return, which means that *args could now point to reclaimed stack
> > space.  This leads to crashes and memory corruption in generic/562 on
> > a 1k block filesystem (though in principle this can happen anywhere):
> 
> 
> What's the right way out of this; store *ip when we enter the function
> and use that to get to the map, rather than args i guess?

Er, no, because the worker could also get preempted right after
complete() and take so long to get rescheduled that the inode has
been reclaimed.  Think about it -- the original thread is waiting on the
completion that it passed to the worker through $args, and therefore the
worker cannot touch any of the resources it was accessing through $args
after calling complete()....

--D

> Thanks,
> -Eric
Pavel Reichl Oct. 7, 2020, 9:15 p.m. UTC | #4
On 10/7/20 5:25 PM, Darrick J. Wong wrote:
> On Wed, Oct 07, 2020 at 09:17:13AM -0500, Eric Sandeen wrote:
>> On 10/6/20 8:21 PM, Darrick J. Wong wrote:
>>> On Tue, Oct 06, 2020 at 09:15:41PM +0200, Pavel Reichl wrote:
>>>> Remove mrlock_t as it does not provide any extra value over
>>>> rw_semaphores. Make i_lock and i_mmaplock native rw_semaphores and
>>>> replace mr*() functions with native rwsem calls.
>>>>
>>>> Release the lock in xfs_btree_split() just before the work-queue
>>>> executing xfs_btree_split_worker() is scheduled and make
>>>> xfs_btree_split_worker() to acquire the lock as a first thing and
>>>> release it just before returning from the function. This it done so the
>>>> ownership of the lock is transfered between kernel threads and thus
>>>> lockdep won't complain about lock being held by a different kernel
>>>> thread.
>>>>
>>>> Signed-off-by: Pavel Reichl <preichl@redhat.com>
>>>> ---
>>>>  fs/xfs/libxfs/xfs_btree.c | 14 +++++++
>>>>  fs/xfs/mrlock.h           | 78 ---------------------------------------
>>>>  fs/xfs/xfs_inode.c        | 36 ++++++++++--------
>>>>  fs/xfs/xfs_inode.h        |  4 +-
>>>>  fs/xfs/xfs_iops.c         |  4 +-
>>>>  fs/xfs/xfs_linux.h        |  2 +-
>>>>  fs/xfs/xfs_super.c        |  6 +--
>>>>  7 files changed, 41 insertions(+), 103 deletions(-)
>>>>  delete mode 100644 fs/xfs/mrlock.h
>>>>
>>>> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
>>>> index 2d25bab68764..1d1bb8423688 100644
>>>> --- a/fs/xfs/libxfs/xfs_btree.c
>>>> +++ b/fs/xfs/libxfs/xfs_btree.c
>>>> @@ -2816,6 +2816,7 @@ xfs_btree_split_worker(
>>>>  	unsigned long		pflags;
>>>>  	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
>>>>  
>>>> +	rwsem_acquire(&args->cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
>>> These calls also need a comment explaining just what they're doing.
>>>
>>>>  	/*
>>>>  	 * we are in a transaction context here, but may also be doing work
>>>>  	 * in kswapd context, and hence we may need to inherit that state
>>>> @@ -2832,6 +2833,7 @@ xfs_btree_split_worker(
>>>>  	complete(args->done);
>>>>  
>>>>  	current_restore_flags_nested(&pflags, new_pflags);
>>>> +	rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
>>> Note that as soon as you call complete(), xfs_btree_split can wake up
>>> and return, which means that *args could now point to reclaimed stack
>>> space.  This leads to crashes and memory corruption in generic/562 on
>>> a 1k block filesystem (though in principle this can happen anywhere):
>>
>>
>> What's the right way out of this; store *ip when we enter the function
>> and use that to get to the map, rather than args i guess?
> 
> Er, no, because the worker could also get preempted right after
> complete() and take so long to get rescheduled that the the inode have
> been reclaimed.  Think about it -- the original thread is waiting on the
> completion that it passed to the worker through $args, and therefore the
> worker cannot touch any of the resources it was accessing through $args
> after calling complete()....

Hi,

thanks for the comments, however for some reason I cannot reproduce the same memory corruption you are getting.

Do you think that moving the 'rwsem_release()' right before the 'complete()' should fix the problem?

Something like:


+       /*
+        * Update lockdep's lock ownership information to point to
+        * this thread as the thread that scheduled this worker is waiting
+        * for it's completion.
+        */
        rwsem_acquire(&args->cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
        /*
         * we are in a transaction context here, but may also be doing work
@@ -2830,10 +2835,15 @@ xfs_btree_split_worker(
 
        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
                                         args->key, args->curp, args->stat);
+       /*
+        * Update lockdep's lock ownership information to reflect that we will
+        * be transferring the ilock from this worker back to the scheduling
+        * thread.
+        */
+       rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
        complete(args->done);
 
        current_restore_flags_nested(&pflags, new_pflags);
-       rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);



> 
> --D
> 
>> Thanks,
>> -Eric
>
Darrick J. Wong Oct. 7, 2020, 9:55 p.m. UTC | #5
On Wed, Oct 07, 2020 at 11:15:32PM +0200, Pavel Reichl wrote:
> 
> 
> On 10/7/20 5:25 PM, Darrick J. Wong wrote:
> > On Wed, Oct 07, 2020 at 09:17:13AM -0500, Eric Sandeen wrote:
> >> On 10/6/20 8:21 PM, Darrick J. Wong wrote:
> >>> On Tue, Oct 06, 2020 at 09:15:41PM +0200, Pavel Reichl wrote:
> >>>> Remove mrlock_t as it does not provide any extra value over
> >>>> rw_semaphores. Make i_lock and i_mmaplock native rw_semaphores and
> >>>> replace mr*() functions with native rwsem calls.
> >>>>
> >>>> Release the lock in xfs_btree_split() just before the work-queue
> >>>> executing xfs_btree_split_worker() is scheduled and make
> >>>> xfs_btree_split_worker() to acquire the lock as a first thing and
> >>>> release it just before returning from the function. This it done so the
> >>>> ownership of the lock is transfered between kernel threads and thus
> >>>> lockdep won't complain about lock being held by a different kernel
> >>>> thread.
> >>>>
> >>>> Signed-off-by: Pavel Reichl <preichl@redhat.com>
> >>>> ---
> >>>>  fs/xfs/libxfs/xfs_btree.c | 14 +++++++
> >>>>  fs/xfs/mrlock.h           | 78 ---------------------------------------
> >>>>  fs/xfs/xfs_inode.c        | 36 ++++++++++--------
> >>>>  fs/xfs/xfs_inode.h        |  4 +-
> >>>>  fs/xfs/xfs_iops.c         |  4 +-
> >>>>  fs/xfs/xfs_linux.h        |  2 +-
> >>>>  fs/xfs/xfs_super.c        |  6 +--
> >>>>  7 files changed, 41 insertions(+), 103 deletions(-)
> >>>>  delete mode 100644 fs/xfs/mrlock.h
> >>>>
> >>>> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> >>>> index 2d25bab68764..1d1bb8423688 100644
> >>>> --- a/fs/xfs/libxfs/xfs_btree.c
> >>>> +++ b/fs/xfs/libxfs/xfs_btree.c
> >>>> @@ -2816,6 +2816,7 @@ xfs_btree_split_worker(
> >>>>  	unsigned long		pflags;
> >>>>  	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
> >>>>  
> >>>> +	rwsem_acquire(&args->cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
> >>> These calls also need a comment explaining just what they're doing.
> >>>
> >>>>  	/*
> >>>>  	 * we are in a transaction context here, but may also be doing work
> >>>>  	 * in kswapd context, and hence we may need to inherit that state
> >>>> @@ -2832,6 +2833,7 @@ xfs_btree_split_worker(
> >>>>  	complete(args->done);
> >>>>  
> >>>>  	current_restore_flags_nested(&pflags, new_pflags);
> >>>> +	rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
> >>> Note that as soon as you call complete(), xfs_btree_split can wake up
> >>> and return, which means that *args could now point to reclaimed stack
> >>> space.  This leads to crashes and memory corruption in generic/562 on
> >>> a 1k block filesystem (though in principle this can happen anywhere):
> >>
> >>
> >> What's the right way out of this; store *ip when we enter the function
> >> and use that to get to the map, rather than args i guess?
> > 
> > Er, no, because the worker could also get preempted right after
> > complete() and take so long to get rescheduled that the the inode have
> > been reclaimed.  Think about it -- the original thread is waiting on the
> > completion that it passed to the worker through $args, and therefore the
> > worker cannot touch any of the resources it was accessing through $args
> > after calling complete()....
> 
> Hi,
> 
> thanks for the comments, however for some reason I cannot reproduce
> the same memory corruption you are getting.

<shrug> Do you have full preempt enabled?

> Do you think that moving the 'rwsem_release()' right before the
> 'complete()' should fix the problem?
> 
> Something like:
> 
> 
> +       /*
> +        * Update lockdep's lock ownership information to point to
> +        * this thread as the thread that scheduled this worker is waiting
> +        * for it's completion.

Nit: "it's" is always a contraction of "it is"; "its" is the correct
(possessive) form here.

Otherwise, this looks fine to me.

--D

> +        */
>         rwsem_acquire(&args->cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
>         /*
>          * we are in a transaction context here, but may also be doing work
> @@ -2830,10 +2835,15 @@ xfs_btree_split_worker(
>  
>         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
>                                          args->key, args->curp, args->stat);
> +       /*
> +        * Update lockdep's lock ownership information to reflect that we will
> +        * be transferring the ilock from this worker back to the scheduling
> +        * thread.
> +        */
> +       rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
>         complete(args->done);
>  
>         current_restore_flags_nested(&pflags, new_pflags);
> -       rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
> 
> 
> 
> > 
> > --D
> > 
> >> Thanks,
> >> -Eric
> > 
>
Pavel Reichl Oct. 8, 2020, 1:55 p.m. UTC | #6
>> Hi,
>>
>> thanks for the comments, however for some reason I cannot reproduce
>> the same memory corruption you are getting.
> 
> <shrug> Do you have full preempt enabled?

Hi, I'm not proud to admit that until now I tested w/o 'CONFIG_PREEMPT=y' :-/
However at least now I can see the bug you hit and test that the proposed change in version #10 fixes that.


> 
>> Do you think that moving the 'rwsem_release()' right before the
>> 'complete()' should fix the problem?
>>
>> Something like:
>>
>>
>> +       /*
>> +        * Update lockdep's lock ownership information to point to
>> +        * this thread as the thread that scheduled this worker is waiting
>> +        * for it's completion.
> 
> Nit: "it's" is always a contraction of "it is"; "its" is correct
> (posessive) form here.

Thanks for noticing. I know the difference... but I still made this mistake. I must focus more next time.

> 
> Otherwise, this looks fine to me.

Thanks, version #10 is on list now.

Bye.
Darrick J. Wong Oct. 8, 2020, 4:16 p.m. UTC | #7
On Thu, Oct 08, 2020 at 03:55:16PM +0200, Pavel Reichl wrote:
> 
> >> Hi,
> >>
> >> thanks for the comments, however for some reason I cannot reproduce
> >> the same memory corruption you are getting.
> > 
> > <shrug> Do you have full preempt enabled?
> 
> Hi, I'm not proud to admit that until now I tested w/o 'CONFIG_PREEMPT=y' :-/
> However at least now I can see the bug you hit and test that the
> proposed change in version #10 fixes that.

<shrug> That just means you get to hit all the stall warnings (which are
fixable with cond_resched()) that I rarely see because preempt kernels
can reschedule at will... :)

> 
> 
> > 
> >> Do you think that moving the 'rwsem_release()' right before the
> >> 'complete()' should fix the problem?
> >>
> >> Something like:
> >>
> >>
> >> +       /*
> >> +        * Update lockdep's lock ownership information to point to
> >> +        * this thread as the thread that scheduled this worker is waiting
> >> +        * for it's completion.
> > 
> > Nit: "it's" is always a contraction of "it is"; "its" is correct
> > (posessive) form here.
> 
> Thanks for noticing. I know the difference...but still I did this
> mistake. I must focus more next time.

No worries, English is a weird language.

"Inflammable means flammable?  What a country!"
          -- Dr. Nick Riviera

--D

> 
> > 
> > Otherwise, this looks fine to me.
> 
> Thanks, version #10 is on list now.
> 
> Bye.
>
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 2d25bab68764..1d1bb8423688 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2816,6 +2816,7 @@  xfs_btree_split_worker(
 	unsigned long		pflags;
 	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
 
+	rwsem_acquire(&args->cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
 	/*
 	 * we are in a transaction context here, but may also be doing work
 	 * in kswapd context, and hence we may need to inherit that state
@@ -2832,6 +2833,7 @@  xfs_btree_split_worker(
 	complete(args->done);
 
 	current_restore_flags_nested(&pflags, new_pflags);
+	rwsem_release(&args->cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
 }
 
 /*
@@ -2863,8 +2865,20 @@  xfs_btree_split(
 	args.done = &done;
 	args.kswapd = current_is_kswapd();
 	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+	/*
+	 * Update lockdep's ownership information to reflect that we
+	 * will be transferring the ilock from this thread to the
+	 * worker.
+	 */
+	rwsem_release(&cur->bc_ino.ip->i_lock.dep_map, _THIS_IP_);
 	queue_work(xfs_alloc_wq, &args.work);
 	wait_for_completion(&done);
+	/*
+	 * Update lockdep's lock ownership information to point to
+	 * this thread as the lock owner now that the worker item is
+	 * done.
+	 */
+	rwsem_acquire(&cur->bc_ino.ip->i_lock.dep_map, 0, 0, _RET_IP_);
 	destroy_work_on_stack(&args.work);
 	return args.result;
 }
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
deleted file mode 100644
index 79155eec341b..000000000000
--- a/fs/xfs/mrlock.h
+++ /dev/null
@@ -1,78 +0,0 @@ 
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_MRLOCK_H__
-#define __XFS_SUPPORT_MRLOCK_H__
-
-#include <linux/rwsem.h>
-
-typedef struct {
-	struct rw_semaphore	mr_lock;
-#if defined(DEBUG) || defined(XFS_WARN)
-	int			mr_writer;
-#endif
-} mrlock_t;
-
-#if defined(DEBUG) || defined(XFS_WARN)
-#define mrinit(mrp, name)	\
-	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
-#else
-#define mrinit(mrp, name)	\
-	do { init_rwsem(&(mrp)->mr_lock); } while (0)
-#endif
-
-#define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
-#define mrfree(mrp)		do { } while (0)
-
-static inline void mraccess_nested(mrlock_t *mrp, int subclass)
-{
-	down_read_nested(&mrp->mr_lock, subclass);
-}
-
-static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
-{
-	down_write_nested(&mrp->mr_lock, subclass);
-#if defined(DEBUG) || defined(XFS_WARN)
-	mrp->mr_writer = 1;
-#endif
-}
-
-static inline int mrtryaccess(mrlock_t *mrp)
-{
-	return down_read_trylock(&mrp->mr_lock);
-}
-
-static inline int mrtryupdate(mrlock_t *mrp)
-{
-	if (!down_write_trylock(&mrp->mr_lock))
-		return 0;
-#if defined(DEBUG) || defined(XFS_WARN)
-	mrp->mr_writer = 1;
-#endif
-	return 1;
-}
-
-static inline void mrunlock_excl(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
-	mrp->mr_writer = 0;
-#endif
-	up_write(&mrp->mr_lock);
-}
-
-static inline void mrunlock_shared(mrlock_t *mrp)
-{
-	up_read(&mrp->mr_lock);
-}
-
-static inline void mrdemote(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
-	mrp->mr_writer = 0;
-#endif
-	downgrade_write(&mrp->mr_lock);
-}
-
-#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 035925d406d5..213a4a947854 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -191,14 +191,15 @@  xfs_ilock(
 	}
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
-		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+		down_write_nested(&ip->i_mmaplock,
+				XFS_MMAPLOCK_DEP(lock_flags));
 	else if (lock_flags & XFS_MMAPLOCK_SHARED)
-		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+		down_read_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
 
 	if (lock_flags & XFS_ILOCK_EXCL)
-		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+		down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 	else if (lock_flags & XFS_ILOCK_SHARED)
-		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+		down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 }
 
 /*
@@ -242,27 +243,27 @@  xfs_ilock_nowait(
 	}
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_mmaplock))
+		if (!down_write_trylock(&ip->i_mmaplock))
 			goto out_undo_iolock;
 	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_mmaplock))
+		if (!down_read_trylock(&ip->i_mmaplock))
 			goto out_undo_iolock;
 	}
 
 	if (lock_flags & XFS_ILOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_lock))
+		if (!down_write_trylock(&ip->i_lock))
 			goto out_undo_mmaplock;
 	} else if (lock_flags & XFS_ILOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_lock))
+		if (!down_read_trylock(&ip->i_lock))
 			goto out_undo_mmaplock;
 	}
 	return 1;
 
 out_undo_mmaplock:
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
-		mrunlock_excl(&ip->i_mmaplock);
+		up_write(&ip->i_mmaplock);
 	else if (lock_flags & XFS_MMAPLOCK_SHARED)
-		mrunlock_shared(&ip->i_mmaplock);
+		up_read(&ip->i_mmaplock);
 out_undo_iolock:
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		up_write(&VFS_I(ip)->i_rwsem);
@@ -309,14 +310,14 @@  xfs_iunlock(
 		up_read(&VFS_I(ip)->i_rwsem);
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
-		mrunlock_excl(&ip->i_mmaplock);
+		up_write(&ip->i_mmaplock);
 	else if (lock_flags & XFS_MMAPLOCK_SHARED)
-		mrunlock_shared(&ip->i_mmaplock);
+		up_read(&ip->i_mmaplock);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
-		mrunlock_excl(&ip->i_lock);
+		up_write(&ip->i_lock);
 	else if (lock_flags & XFS_ILOCK_SHARED)
-		mrunlock_shared(&ip->i_lock);
+		up_read(&ip->i_lock);
 
 	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 }
@@ -335,9 +336,9 @@  xfs_ilock_demote(
 		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
-		mrdemote(&ip->i_lock);
+		downgrade_write(&ip->i_lock);
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
-		mrdemote(&ip->i_mmaplock);
+		downgrade_write(&ip->i_mmaplock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		downgrade_write(&VFS_I(ip)->i_rwsem);
 
@@ -385,11 +386,14 @@  xfs_isilocked(
 	uint			lock_flags)
 {
 	if (lock_flags & (XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)) {
+		ASSERT(!(lock_flags & ~(XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)));
 		return __xfs_rwsem_islocked(&ip->i_lock,
 				(lock_flags >> XFS_ILOCK_FLAG_SHIFT));
 	}
 
 	if (lock_flags & (XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)) {
+		ASSERT(!(lock_flags &
+			~(XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)));
 		return __xfs_rwsem_islocked(&ip->i_mmaplock,
 				(lock_flags >> XFS_MMAPLOCK_FLAG_SHIFT));
 	}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 77d5655191ab..02c98ecfe4c5 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -39,8 +39,8 @@  typedef struct xfs_inode {
 
 	/* Transaction and locking information. */
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
-	mrlock_t		i_lock;		/* inode lock */
-	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
+	struct rw_semaphore	i_lock;		/* inode lock */
+	struct rw_semaphore	i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
 
 	/*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 80a13c8561d8..66cca3e599c7 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1336,9 +1336,9 @@  xfs_setup_inode(
 		 */
 		lockdep_set_class(&inode->i_rwsem,
 				  &inode->i_sb->s_type->i_mutex_dir_key);
-		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+		lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class);
 	} else {
-		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+		lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class);
 	}
 
 	/*
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ab737fed7b12..ba37217f86d2 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -22,7 +22,6 @@  typedef __u32			xfs_nlink_t;
 #include "xfs_types.h"
 
 #include "kmem.h"
-#include "mrlock.h"
 
 #include <linux/semaphore.h>
 #include <linux/mm.h>
@@ -61,6 +60,7 @@  typedef __u32			xfs_nlink_t;
 #include <linux/ratelimit.h>
 #include <linux/rhashtable.h>
 #include <linux/xattr.h>
+#include <linux/rwsem.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 71ac6c1cdc36..00be9cfa29fa 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -708,10 +708,8 @@  xfs_fs_inode_init_once(
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
 
-	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-		     "xfsino", ip->i_ino);
-	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-		     "xfsino", ip->i_ino);
+	init_rwsem(&ip->i_mmaplock);
+	init_rwsem(&ip->i_lock);
 }
 
 /*