
[RFC,1/1] fs/namespace.c: use spinlock instead of busy loop

Message ID 20200617104058.14902-2-john.ogness@linutronix.de (mailing list archive)
State New, archived
Series fs: remove retry loop

Commit Message

John Ogness June 17, 2020, 10:40 a.m. UTC
The MNT_WRITE_HOLD flag is used to manually implement a per-cpu
optimized rwsem using busy waiting. This implementation is a problem
on PREEMPT_RT because write_seqlock() on @mount_lock (i.e. taking a
spinlock) does not disable preemption. This allows a writer to
preempt a task that has set MNT_WRITE_HOLD and thus that writer will
live lock in __mnt_want_write() due to busy looping with preemption
disabled.

Reimplement the same semantics using per-cpu spinlocks. This
provides lockdep coverage and makes the code RT ready.
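
In essence, the SMP fast path of the new __mnt_want_write() becomes the
following (a condensed sketch of the patch below; the per-cpu lock setup
and the !SMP case are omitted):

	int __mnt_want_write(struct vfsmount *m)
	{
		struct mount *mnt = real_mount(m);
		spinlock_t *lock;
		int ret = 0;

		/* Stay on this CPU so the lock taken is the one covering
		 * this CPU's writer counter. */
		migrate_disable();
		lock = &this_cpu_ptr(mnt->mnt_pcp)->mnt_writers_lock;

		/* On PREEMPT_RT this sleeps instead of busy waiting on
		 * MNT_WRITE_HOLD. */
		spin_lock(lock);
		if (mnt_is_readonly(m))
			ret = -EROFS;
		else
			mnt_inc_writers(mnt);
		spin_unlock(lock);

		migrate_enable();
		return ret;
	}

The slow path (mnt_make_readonly()) takes every CPU's mnt_writers_lock
before summing the per-cpu counters, so the reader side no longer needs
to spin on MNT_WRITE_HOLD.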

Since this reverts some of the optimization work of
  commit d3ef3d7351cc ("fs: mnt_want_write speedup"),
lmbench lat_mmap tests were performed to verify that there is no
obvious performance degradation.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
---
 Here is the detailed test information...

 TEST COMMAND
 lat_mmap -P $pval -W 32 -N 50 64m file

 OUTPUT FORMAT
 pval: avg std   (one avg/std latency pair per kernel, BEFORE then AFTER; lower is better)

 RESULTS (32 CPUs)
 No Forced Preemption
           BEFORE         AFTER
  1:   275.60  1.82   274.40  0.55
  2:   296.20  3.83   286.80  1.92
  4:   310.20  4.44   304.40  2.51
  8:   359.20  2.28   357.80  2.95
 16:   417.40  2.51   412.20  3.90
 32:   625.60  2.07   622.00  3.08
 64:  1202.60 15.87  1202.20  6.14

 No Forced Preemption, no PTI
           BEFORE         AFTER
  1:   278.00  2.12   274.40  1.52
  2:   301.00  3.67   289.80  6.06
  4:   333.40  7.73   303.80  2.39
  8:   389.80  3.56   351.80  3.42
 16:   425.00  3.46   408.20  4.87
 32:   606.00  1.22   605.60  1.82
 64:  1193.60  7.09  1184.80  4.27

 Voluntary Kernel Preemption
           BEFORE         AFTER
  1:   277.80  1.30   278.20  1.10
  2:   291.20  0.84   286.60  2.30
  4:   310.00  1.87   304.80  1.30
  8:   360.00  2.55   354.60  1.14
 16:   414.00  1.58   414.00  2.35
 32:   619.60  5.50   607.00  3.74
 64:  1224.00  8.40  1219.60  6.19

 Voluntary Kernel Preemption, no PTI
           BEFORE         AFTER
  1:   277.80  4.66   276.40  0.89
  2:   291.40  6.54   286.40  3.58
  4:   310.00  1.22   315.40  1.14
  8:   357.20  0.84   361.40  2.61
 16:   405.60  2.88   407.60  2.51
 32:   615.40  2.30   611.60  5.55
 64:  1219.80  9.91  1207.40 10.88

 Preemptible Kernel
           BEFORE         AFTER
  1:   283.80  0.45   286.80  0.84
  2:   293.40  2.51   294.40  3.51
  4:   318.20  1.30   315.60  1.95
  8:   367.00  0.71   363.00  1.22
 16:   416.20  1.64   413.20  4.87
 32:   628.80  2.28   617.40  2.97
 64:  1277.20  9.88  1254.20  4.87

 Preemptible Kernel, no PTI
           BEFORE         AFTER
  1:   283.00  1.73   288.40  1.67
  2:   305.80  2.05   297.00  3.24
  4:   321.40  4.34   318.60  2.79
  8:   370.20  2.39   366.40  2.70
 16:   413.20  3.11   412.40  2.41
 32:   616.40  2.61   620.20  2.05
 64:  1266.00  6.48  1255.80  3.90

 fs/mount.h     |   7 +++
 fs/namespace.c | 118 +++++++++++++++++++++++++++++++++----------------
 2 files changed, 86 insertions(+), 39 deletions(-)

Comments

Peter Zijlstra June 17, 2020, 11:26 a.m. UTC | #1
On Wed, Jun 17, 2020 at 12:46:58PM +0206, John Ogness wrote:
> @@ -459,17 +469,39 @@ void mnt_drop_write_file(struct file *file)
>  }
>  EXPORT_SYMBOL(mnt_drop_write_file);
>  
> +static void mnt_lock_writers(struct mount *mnt)
> +{
> +#ifdef CONFIG_SMP
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		spin_lock(&per_cpu_ptr(mnt->mnt_pcp,
> +				       cpu)->mnt_writers_lock);
> +	}
> +#else
> +	spin_lock(&mnt->mnt_writers_lock);
> +#endif
> +}
> +
> +static void mnt_unlock_writers(struct mount *mnt)
> +{
> +#ifdef CONFIG_SMP
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		spin_unlock(&per_cpu_ptr(mnt->mnt_pcp,
> +					 cpu)->mnt_writers_lock);
> +	}
> +#else
> +	spin_unlock(&mnt->mnt_writers_lock);
> +#endif
> +}

*groan*.. this is brlock reincarnate :-/ Also broken.
Peter Zijlstra June 17, 2020, 11:27 a.m. UTC | #2
On Wed, Jun 17, 2020 at 12:46:58PM +0206, John Ogness wrote:
> The MNT_WRITE_HOLD flag is used to manually implement a per-cpu
> optimized rwsem using busy waiting. This implementation is a problem
> on PREEMPT_RT because write_seqlock() on @mount_lock (i.e. taking a
> spinlock) does not disable preemption. This allows a writer to
> preempt a task that has set MNT_WRITE_HOLD and thus that writer will
> live lock in __mnt_want_write() due to busy looping with preemption
> disabled.
> 

Why can't you use a regular percpu-rwsem for this?
Sebastian Andrzej Siewior June 17, 2020, 12:09 p.m. UTC | #3
On 2020-06-17 13:27:19 [+0200], Peter Zijlstra wrote:
> Why can't you use a regular percpu-rwsem for this?

This is the percpu-rwsem version. The lock is per struct super_block not
per mount point. This is due to the restriction in
sb_prepare_remount_readonly().

diff --git a/fs/mount.h b/fs/mount.h
index 711a4093e475..344fae9931b5 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,6 +4,7 @@
 #include <linux/poll.h>
 #include <linux/ns_common.h>
 #include <linux/fs_pin.h>
+#include <linux/percpu-rwsem.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -40,6 +41,7 @@ struct mount {
 		struct rcu_head mnt_rcu;
 		struct llist_node mnt_llist;
 	};
+	struct percpu_rw_semaphore mnt_writers_rws;
 #ifdef CONFIG_SMP
 	struct mnt_pcp __percpu *mnt_pcp;
 #else
diff --git a/fs/namespace.c b/fs/namespace.c
index a28e4db075ed..cc05c922d7d6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -311,30 +311,17 @@ static int mnt_is_readonly(struct vfsmount *mnt)
 int __mnt_want_write(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
+	struct super_block *sb = m->mnt_sb;
 	int ret = 0;
 
-	preempt_disable();
-	mnt_inc_writers(mnt);
-	/*
-	 * The store to mnt_inc_writers must be visible before we pass
-	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
-	 * incremented count after it has set MNT_WRITE_HOLD.
-	 */
-	smp_mb();
-	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
-		cpu_relax();
-	/*
-	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
-	 * be set to match its requirements. So we must not load that until
-	 * MNT_WRITE_HOLD is cleared.
-	 */
-	smp_rmb();
-	if (mnt_is_readonly(m)) {
-		mnt_dec_writers(mnt);
+	percpu_down_read(&sb->mnt_writers_rws);
+
+	if (mnt_is_readonly(m))
 		ret = -EROFS;
-	}
-	preempt_enable();
+	else
+		mnt_inc_writers(mnt);
 
+	percpu_up_read(&sb->mnt_writers_rws);
 	return ret;
 }
 
@@ -459,53 +446,30 @@ void mnt_drop_write_file(struct file *file)
 }
 EXPORT_SYMBOL(mnt_drop_write_file);
 
-static int mnt_make_readonly(struct mount *mnt)
+static int mnt_make_readonly(struct super_block *sb, struct mount *mnt)
 {
 	int ret = 0;
 
+	percpu_down_write(&sb->mnt_writers_rws);
 	lock_mount_hash();
-	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
-	/*
-	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
-	 * should be visible before we do.
-	 */
-	smp_mb();
 
-	/*
-	 * With writers on hold, if this value is zero, then there are
-	 * definitely no active writers (although held writers may subsequently
-	 * increment the count, they'll have to wait, and decrement it after
-	 * seeing MNT_READONLY).
-	 *
-	 * It is OK to have counter incremented on one CPU and decremented on
-	 * another: the sum will add up correctly. The danger would be when we
-	 * sum up each counter, if we read a counter before it is incremented,
-	 * but then read another CPU's count which it has been subsequently
-	 * decremented from -- we would see more decrements than we should.
-	 * MNT_WRITE_HOLD protects against this scenario, because
-	 * mnt_want_write first increments count, then smp_mb, then spins on
-	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
-	 * we're counting up here.
-	 */
 	if (mnt_get_writers(mnt) > 0)
 		ret = -EBUSY;
 	else
 		mnt->mnt.mnt_flags |= MNT_READONLY;
-	/*
-	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
-	 * that become unheld will see MNT_READONLY.
-	 */
-	smp_wmb();
-	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+
 	unlock_mount_hash();
+	percpu_up_write(&sb->mnt_writers_rws);
 	return ret;
 }
 
-static int __mnt_unmake_readonly(struct mount *mnt)
+static int __mnt_unmake_readonly(struct super_block *sb, struct mount *mnt)
 {
+	percpu_down_write(&sb->mnt_writers_rws);
 	lock_mount_hash();
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
 	unlock_mount_hash();
+	percpu_up_write(&sb->mnt_writers_rws);
 	return 0;
 }
 
@@ -514,21 +478,22 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 	struct mount *mnt;
 	int err = 0;
 
-	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
+	/* Racy optimization.  Recheck the counter under mnt_writers_rws. */
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;
 
+	percpu_down_write(&sb->mnt_writers_rws);
 	lock_mount_hash();
+
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
-			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
-			smp_mb();
 			if (mnt_get_writers(mnt) > 0) {
 				err = -EBUSY;
 				break;
 			}
 		}
 	}
+
 	if (!err && atomic_long_read(&sb->s_remove_count))
 		err = -EBUSY;
 
@@ -536,11 +501,9 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 		sb->s_readonly_remount = 1;
 		smp_wmb();
 	}
-	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
-		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
-			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-	}
+
 	unlock_mount_hash();
+	percpu_up_write(&sb->mnt_writers_rws);
 
 	return err;
 }
@@ -1036,7 +999,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	}
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+	mnt->mnt.mnt_flags &= ~(MNT_MARKED|MNT_INTERNAL);
 
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_sb = sb;
@@ -2440,7 +2403,8 @@ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
 	return true;
 }
 
-static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
+static int change_mount_ro_state(struct super_block *sb,
+				 struct mount *mnt, unsigned int mnt_flags)
 {
 	bool readonly_request = (mnt_flags & MNT_READONLY);
 
@@ -2448,9 +2412,9 @@ static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
 		return 0;
 
 	if (readonly_request)
-		return mnt_make_readonly(mnt);
+		return mnt_make_readonly(sb, mnt);
 
-	return __mnt_unmake_readonly(mnt);
+	return __mnt_unmake_readonly(sb, mnt);
 }
 
 /*
@@ -2509,7 +2473,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
 		return -EPERM;
 
 	down_write(&sb->s_umount);
-	ret = change_mount_ro_state(mnt, mnt_flags);
+	ret = change_mount_ro_state(sb, mnt, mnt_flags);
 	if (ret == 0)
 		set_mount_attributes(mnt, mnt_flags);
 	up_write(&sb->s_umount);
diff --git a/fs/super.c b/fs/super.c
index a288cd60d2ae..b6036e5fbe8d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -210,6 +210,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	INIT_LIST_HEAD(&s->s_mounts);
 	s->s_user_ns = get_user_ns(user_ns);
 	init_rwsem(&s->s_umount);
+	percpu_init_rwsem(&s->mnt_writers_rws);
+
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
 	/*
 	 * sget() can have s_umount recursion.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 45cc10cdf6dd..cc259b7c4ee8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1458,6 +1458,7 @@ struct super_block {
 #endif
 	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
+	struct percpu_rw_semaphore mnt_writers_rws;
 	struct block_device	*s_bdev;
 	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index bf8cc4108b8f..8daf217108e8 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -32,7 +32,6 @@ struct fs_context;
 #define MNT_READONLY	0x40	/* does the user want this to be r/o? */
 
 #define MNT_SHRINKABLE	0x100
-#define MNT_WRITE_HOLD	0x200
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -49,7 +48,7 @@ struct fs_context;
 				 | MNT_READONLY)
 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
 
-#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
+#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_INTERNAL | \
 			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
 
 #define MNT_INTERNAL	0x4000

Patch

diff --git a/fs/mount.h b/fs/mount.h
index c7abb7b394d8..627e635cd7d1 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -28,6 +28,12 @@  struct mnt_namespace {
 struct mnt_pcp {
 	int mnt_count;
 	int mnt_writers;
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t mnt_writers_lock;
+	struct lock_class_key mnt_writers_lock_key;
 };
 
 struct mountpoint {
@@ -51,6 +57,7 @@  struct mount {
 #else
 	int mnt_count;
 	int mnt_writers;
+	spinlock_t mnt_writers_lock;
 #endif
 	struct list_head mnt_mounts;	/* list of children, anchored here */
 	struct list_head mnt_child;	/* and going through their mnt_child */
diff --git a/fs/namespace.c b/fs/namespace.c
index f30ed401cc6d..e292c91f966d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -177,6 +177,7 @@  static struct mount *alloc_vfsmnt(const char *name)
 	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
 		int err;
+		int cpu;
 
 		err = mnt_alloc_id(mnt);
 		if (err)
@@ -194,9 +195,21 @@  static struct mount *alloc_vfsmnt(const char *name)
 			goto out_free_devname;
 
 		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+
+		for_each_possible_cpu(cpu) {
+			struct mnt_pcp *writer =
+					per_cpu_ptr(mnt->mnt_pcp, cpu);
+
+			lockdep_register_key(&writer->mnt_writers_lock_key);
+			spin_lock_init(&writer->mnt_writers_lock);
+			lockdep_set_class(&writer->mnt_writers_lock,
+					  &writer->mnt_writers_lock_key);
+		}
 #else
+		(void)cpu;
 		mnt->mnt_count = 1;
 		mnt->mnt_writers = 0;
+		spin_lock_init(&mnt->mnt_writers_lock);
 #endif
 
 		INIT_HLIST_NODE(&mnt->mnt_hash);
@@ -311,29 +324,26 @@  static int mnt_is_readonly(struct vfsmount *mnt)
 int __mnt_want_write(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
+	spinlock_t *lock;
 	int ret = 0;
 
-	preempt_disable();
-	mnt_inc_writers(mnt);
-	/*
-	 * The store to mnt_inc_writers must be visible before we pass
-	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
-	 * incremented count after it has set MNT_WRITE_HOLD.
-	 */
-	smp_mb();
-	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
-		cpu_relax();
-	/*
-	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
-	 * be set to match its requirements. So we must not load that until
-	 * MNT_WRITE_HOLD is cleared.
-	 */
-	smp_rmb();
-	if (mnt_is_readonly(m)) {
-		mnt_dec_writers(mnt);
+#ifdef CONFIG_SMP
+	migrate_disable();
+	lock = &this_cpu_ptr(mnt->mnt_pcp)->mnt_writers_lock;
+#else
+	lock = &mnt->mnt_writers_lock;
+#endif
+
+	spin_lock(lock);
+	if (mnt_is_readonly(m))
 		ret = -EROFS;
-	}
-	preempt_enable();
+	else
+		mnt_inc_writers(mnt);
+	spin_unlock(lock);
+
+#ifdef CONFIG_SMP
+	migrate_enable();
+#endif
 
 	return ret;
 }
@@ -459,17 +469,39 @@  void mnt_drop_write_file(struct file *file)
 }
 EXPORT_SYMBOL(mnt_drop_write_file);
 
+static void mnt_lock_writers(struct mount *mnt)
+{
+#ifdef CONFIG_SMP
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		spin_lock(&per_cpu_ptr(mnt->mnt_pcp,
+				       cpu)->mnt_writers_lock);
+	}
+#else
+	spin_lock(&mnt->mnt_writers_lock);
+#endif
+}
+
+static void mnt_unlock_writers(struct mount *mnt)
+{
+#ifdef CONFIG_SMP
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		spin_unlock(&per_cpu_ptr(mnt->mnt_pcp,
+					 cpu)->mnt_writers_lock);
+	}
+#else
+	spin_unlock(&mnt->mnt_writers_lock);
+#endif
+}
+
 static int mnt_make_readonly(struct mount *mnt)
 {
 	int ret = 0;
 
 	lock_mount_hash();
-	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
-	/*
-	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
-	 * should be visible before we do.
-	 */
-	smp_mb();
 
 	/*
 	 * With writers on hold, if this value is zero, then there are
@@ -482,21 +514,17 @@  static int mnt_make_readonly(struct mount *mnt)
 	 * sum up each counter, if we read a counter before it is incremented,
 	 * but then read another CPU's count which it has been subsequently
 	 * decremented from -- we would see more decrements than we should.
-	 * MNT_WRITE_HOLD protects against this scenario, because
-	 * mnt_want_write first increments count, then smp_mb, then spins on
-	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
-	 * we're counting up here.
+	 * mnt_writers_lock protects against this scenario, because all CPUs
+	 * are prevented from incrementing the counter until the summation of
+	 * all CPU counters is complete.
 	 */
+	mnt_lock_writers(mnt);
 	if (mnt_get_writers(mnt) > 0)
 		ret = -EBUSY;
 	else
 		mnt->mnt.mnt_flags |= MNT_READONLY;
-	/*
-	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
-	 * that become unheld will see MNT_READONLY.
-	 */
-	smp_wmb();
-	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+	mnt_unlock_writers(mnt);
+
 	unlock_mount_hash();
 	return ret;
 }
@@ -514,15 +542,15 @@  int sb_prepare_remount_readonly(struct super_block *sb)
 	struct mount *mnt;
 	int err = 0;
 
-	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
+	/* Racy optimization.  Recheck the counter under mnt_writers_lock. */
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;
 
 	lock_mount_hash();
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
+			mnt_lock_writers(mnt);
 			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
-			smp_mb();
 			if (mnt_get_writers(mnt) > 0) {
 				err = -EBUSY;
 				break;
@@ -537,8 +565,10 @@  int sb_prepare_remount_readonly(struct super_block *sb)
 		smp_wmb();
 	}
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
-		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
+		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) {
 			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
+			mnt_unlock_writers(mnt);
+		}
 	}
 	unlock_mount_hash();
 
@@ -1099,6 +1129,7 @@  static void cleanup_mnt(struct mount *mnt)
 {
 	struct hlist_node *p;
 	struct mount *m;
+	int cpu;
 	/*
 	 * The warning here probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair.  If this happens, the
@@ -1117,6 +1148,15 @@  static void cleanup_mnt(struct mount *mnt)
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	for_each_possible_cpu(cpu) {
+		struct mnt_pcp *writer = per_cpu_ptr(mnt->mnt_pcp, cpu);
+
+		lockdep_unregister_key(&writer->mnt_writers_lock_key);
+	}
+#else
+	(void)cpu;
+#endif
 	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
 }