
[RFC] fs: make s_count atomic_t

Message ID 20231027-neurologie-miterleben-a8c52a745463@brauner (mailing list archive)
State New, archived
Series: [RFC] fs: make s_count atomic_t

Commit Message

Christian Brauner Oct. 27, 2023, 9:35 a.m. UTC
So, I believe we can drop the sb_lock in bdev_super_lock() for all
holder operations if we turn s_count into an atomic. It will slightly
change semantics for list walks like iterate_supers() but imho that's
fine. It'll mean that list walkers need to acquire sb_lock, then try
to get a reference via atomic_inc_not_zero().

The logic there is simply that if you still found the superblock on the
list then yes, someone could now concurrently drop s_count to zero
behind your back. But because you hold sb_lock they can't remove it from
the list behind your back.

So if you now fail atomic_inc_not_zero() then you know that someone
concurrently managed to drop the ref to zero and wants to remove that
sb from the list. So you just ignore that superblock and go on walking
the list. If, however, you manage to take a temporary reference,
things are fine and you can try to trade it for an active reference.
That's my theory at least...
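
For illustration, here's a condensed sketch of the walker pattern this
implies; it paraphrases the iterate_supers() hunk below rather than
quoting it verbatim:

	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		/* Ref already fell to zero: @sb is about to be
		 * removed from the list, so skip it. */
		if (!atomic_inc_not_zero(&sb->s_count))
			continue;
		spin_unlock(&sb_lock);

		/* Use @sb here; trading the temporary reference for
		 * an active one is atomic_inc_not_zero(&sb->s_active)
		 * under s_umount, as grab_super() does. */

		/* Drop the previous sb only now: the reference we
		 * still hold on @sb keeps our list cursor valid
		 * across the relock. */
		if (p)
			put_super(p);
		p = sb;
		spin_lock(&sb_lock);
	}
	spin_unlock(&sb_lock);
	if (p)
		put_super(p);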

Yes, ofc we add atomics, but for superblocks we shouldn't care,
especially since we have fewer and fewer list walkers. Both
get_super() and get_active_super() are gone, after all.

I'm running xfstests as I send this, and I need to start finishing
PRs, so this is in RFC mode. Thoughts?

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c         | 93 ++++++++++++++++++++++++++--------------------
 include/linux/fs.h |  2 +-
 2 files changed, 53 insertions(+), 42 deletions(-)

Comments

Jan Kara Nov. 2, 2023, 1:48 p.m. UTC | #1
On Fri 27-10-23 11:35:20, Christian Brauner wrote:
> So, I believe we can drop the sb_lock in bdev_super_lock() for all
> holder operations if we turn s_count into an atomic. It will slightly
> change semantics for list walks like iterate_supers() but imho that's
> fine. It'll mean that list walkers need to acquire sb_lock, then try
> to get a reference via atomic_inc_not_zero().
> 
> The logic there is simply that if you still found the superblock on the
> list then yes, someone could now concurrently drop s_count to zero
> behind your back. But because you hold sb_lock they can't remove it from
> the list behind your back.
> 
> So if you now fail atomic_inc_not_zero() then you know that someone
> concurrently managed to drop the ref to zero and wants to remove that
> sb from the list. So you just ignore that superblock and go on walking
> the list. If, however, you manage to take a temporary reference,
> things are fine and you can try to trade it for an active reference.
> That's my theory at least...
> 
> Yes, ofc we add atomics, but for superblocks we shouldn't care,
> especially since we have fewer and fewer list walkers. Both
> get_super() and get_active_super() are gone, after all.
> 
> I'm running xfstests as I send this, and I need to start finishing
> PRs, so this is in RFC mode. Thoughts?
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>

So in principle I agree that we can get rid of some sb_lock use if we
convert sb->s_count to an atomic type. However, does this bring any
significant benefit? I would not expect sb_lock to be contended, and,
as you say, you then need to be more careful about the races, which is
a reason against such a change.

								Honza

Christian Brauner Nov. 2, 2023, 4:05 p.m. UTC | #2
On Thu, Nov 02, 2023 at 02:48:42PM +0100, Jan Kara wrote:
> So in principle I agree that we can get rid of some sb_lock use if we
> convert sb->s_count to an atomic type. However, does this bring any
> significant benefit? I would not expect sb_lock to be contended, and,
> as you say, you then need to be more careful about the races, which is
> a reason against such a change.

I like that we elide the spinlock on the paths where we go from a
block device to its superblock. It's more elegant. But in practice I
don't think it will matter; it's not as if the bdev_*() calls were
performance critical.
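
Concretely, the elision on that path boils down to the following,
paraphrasing the bdev_super_lock() hunk in the patch:

	/* before: sb_lock taken just to bump the count */
	spin_lock(&sb_lock);
	sb->s_count++;
	spin_unlock(&sb_lock);

	/* after: lockless; a zero count here would mean that
	 * bdev->bd_holder pointed at a superblock that is already
	 * being freed, i.e. a UAF */
	if (WARN_ON_ONCE(!atomic_inc_not_zero(&sb->s_count)))
		return NULL;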
Christoph Hellwig Nov. 3, 2023, 8:19 a.m. UTC | #3
Same feeling as Jan here - this looks fine to me, but I wonder if there's
much of a need.  Maybe run it past Al if he has any opinion?
Al Viro Nov. 6, 2023, 6:08 p.m. UTC | #4
On Fri, Nov 03, 2023 at 09:19:08AM +0100, Christoph Hellwig wrote:
> Same feeling as Jan here - this looks fine to me, but I wonder if there's
> much of a need.  Maybe run it past Al if he has any opinion?

[resurfaces from dcache stuff]

TBH, I'd rather see documentation of struct super_block life cycle
rules written up, just to see what ends up being too ugly to document ;-/
I have old notes on that stuff, but they are pretty much invalidated by
the rework that happened this summer...

I don't hate making ->s_count atomic, but short of real evidence that
sb_lock gets serious contention, I don't see much point either way.

PS: Re dcache - I've a growing branch with a bunch of massage in that area,
plus the local attempt at documentation that will go there.  How are we
going to manage the trees?  The coming cycle I'm probably back to a
normal amount of activity; the summer had been a fucking nightmare,
but things have settled down by now...  <looks> at least 5 topical
branches, just going by what I've got at the moment.
Christian Brauner Nov. 7, 2023, 3:01 p.m. UTC | #5
On Mon, Nov 06, 2023 at 06:08:31PM +0000, Al Viro wrote:
> On Fri, Nov 03, 2023 at 09:19:08AM +0100, Christoph Hellwig wrote:
> > Same feeling as Jan here - this looks fine to me, but I wonder if there's
> > much of a need.  Maybe run it past Al if he has any opinion?
> 
> [resurfaces from dcache stuff]
> 
> TBH, I'd rather see documentation of struct super_block life cycle
> rules written up, just to see what ends up being too ugly to document ;-/
> I have old notes on that stuff, but they are pretty much invalidated by
> the rework that happened this summer...

So I can write up an even more detailed document, but
Documentation/filesystems/porting.rst upstream already summarizes what
has been done in some detail.

> I don't hate making ->s_count atomic, but short of real evidence that
> sb_lock gets serious contention, I don't see much point either way.

Doesn't really matter enough imho.

> 
> PS: Re dcache - I've a growing branch with a bunch of massage in that area,
> plus the local attempt at documentation that will go there.  How are we
> going to manage the trees?  The coming cycle I'm probably back to a
> normal amount of activity; the summer had been a fucking nightmare,
> but things have settled down by now...  <looks> at least 5 topical
> branches, just going by what I've got at the moment.

One option is that I pull dcache branches from you and merge them into
vfs.all. I can also send them to Linus but I understand if you prefer to
do this yourself.

The other option is that you just do what you do in your tree and we'll
just deal with merge conflicts in next.

Fwiw, I haven't applied a lot of dcache stuff or taken patches of yours
you've Cced me on recently because I wasn't sure whether that was what
you wanted. I'm happy to pick them up ofc.

Patch

diff --git a/fs/super.c b/fs/super.c
index 71e5e61cfc9e..c58de6bb5633 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -375,7 +375,7 @@  static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	INIT_LIST_HEAD(&s->s_inodes_wb);
 	spin_lock_init(&s->s_inode_wblist_lock);
 
-	s->s_count = 1;
+	atomic_set(&s->s_count, 1);
 	atomic_set(&s->s_active, 1);
 	mutex_init(&s->s_vfs_rename_mutex);
 	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
@@ -409,19 +409,6 @@  static struct super_block *alloc_super(struct file_system_type *type, int flags,
 /*
  * Drop a superblock's refcount.  The caller must hold sb_lock.
  */
-static void __put_super(struct super_block *s)
-{
-	if (!--s->s_count) {
-		list_del_init(&s->s_list);
-		WARN_ON(s->s_dentry_lru.node);
-		WARN_ON(s->s_inode_lru.node);
-		WARN_ON(!list_empty(&s->s_mounts));
-		security_sb_free(s);
-		put_user_ns(s->s_user_ns);
-		kfree(s->s_subtype);
-		call_rcu(&s->rcu, destroy_super_rcu);
-	}
-}
 
 /**
  *	put_super	-	drop a temporary reference to superblock
@@ -430,10 +417,20 @@  static void __put_super(struct super_block *s)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-void put_super(struct super_block *sb)
+void put_super(struct super_block *s)
 {
+	if (!atomic_dec_and_test(&s->s_count))
+		return;
+
 	spin_lock(&sb_lock);
-	__put_super(sb);
+	list_del_init(&s->s_list);
+	WARN_ON(s->s_dentry_lru.node);
+	WARN_ON(s->s_inode_lru.node);
+	WARN_ON(!list_empty(&s->s_mounts));
+	security_sb_free(s);
+	put_user_ns(s->s_user_ns);
+	kfree(s->s_subtype);
+	call_rcu(&s->rcu, destroy_super_rcu);
 	spin_unlock(&sb_lock);
 }
 
@@ -548,8 +545,11 @@  static bool grab_super(struct super_block *sb)
 {
 	bool locked;
 
-	sb->s_count++;
+	locked = atomic_inc_not_zero(&sb->s_count);
 	spin_unlock(&sb_lock);
+	if (!locked)
+		return false;
+
 	locked = super_lock_excl(sb);
 	if (locked) {
 		if (atomic_inc_not_zero(&sb->s_active)) {
@@ -908,19 +908,20 @@  static void __iterate_supers(void (*f)(struct super_block *))
 		/* Pairs with memory barrier in super_wake(). */
 		if (smp_load_acquire(&sb->s_flags) & SB_DYING)
 			continue;
-		sb->s_count++;
+		if (!atomic_inc_not_zero(&sb->s_count))
+			continue;
 		spin_unlock(&sb_lock);
 
 		f(sb);
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		spin_lock(&sb_lock);
 	}
-	if (p)
-		__put_super(p);
 	spin_unlock(&sb_lock);
+	if (p)
+		put_super(p);
 }
 /**
  *	iterate_supers - call function for all active superblocks
@@ -938,7 +939,8 @@  void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		bool locked;
 
-		sb->s_count++;
+		if (!atomic_inc_not_zero(&sb->s_count))
+			continue;
 		spin_unlock(&sb_lock);
 
 		locked = super_lock_shared(sb);
@@ -948,14 +950,14 @@  void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 			super_unlock_shared(sb);
 		}
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		spin_lock(&sb_lock);
 	}
-	if (p)
-		__put_super(p);
 	spin_unlock(&sb_lock);
+	if (p)
+		put_super(p);
 }
 
 /**
@@ -976,7 +978,8 @@  void iterate_supers_type(struct file_system_type *type,
 	hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
 		bool locked;
 
-		sb->s_count++;
+		if (!atomic_inc_not_zero(&sb->s_count))
+			continue;
 		spin_unlock(&sb_lock);
 
 		locked = super_lock_shared(sb);
@@ -986,14 +989,14 @@  void iterate_supers_type(struct file_system_type *type,
 			super_unlock_shared(sb);
 		}
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		spin_lock(&sb_lock);
 	}
-	if (p)
-		__put_super(p);
 	spin_unlock(&sb_lock);
+	if (p)
+		put_super(p);
 }
 
 EXPORT_SYMBOL(iterate_supers_type);
@@ -1007,7 +1010,8 @@  struct super_block *user_get_super(dev_t dev, bool excl)
 		if (sb->s_dev ==  dev) {
 			bool locked;
 
-			sb->s_count++;
+			if (!atomic_inc_not_zero(&sb->s_count))
+				continue;
 			spin_unlock(&sb_lock);
 			/* still alive? */
 			locked = super_lock(sb, excl);
@@ -1017,8 +1021,8 @@  struct super_block *user_get_super(dev_t dev, bool excl)
 				super_unlock(sb, excl);
 			}
 			/* nope, got unmounted */
+			put_super(sb);
 			spin_lock(&sb_lock);
-			__put_super(sb);
 			break;
 		}
 	}
@@ -1387,20 +1391,27 @@  static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
 	__releases(&bdev->bd_holder_lock)
 {
 	struct super_block *sb = bdev->bd_holder;
-	bool locked;
+	bool active;
 
 	lockdep_assert_held(&bdev->bd_holder_lock);
 	lockdep_assert_not_held(&sb->s_umount);
 	lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
 
-	/* Make sure sb doesn't go away from under us */
-	spin_lock(&sb_lock);
-	sb->s_count++;
-	spin_unlock(&sb_lock);
+	active = atomic_inc_not_zero(&sb->s_count);
 
 	mutex_unlock(&bdev->bd_holder_lock);
 
-	locked = super_lock(sb, excl);
+	/*
+	 * The bd_holder_lock guarantees that @sb is still valid.
+	 * sb->s_count can't be zero. If it were it would mean that we
+	 * found a block device that has bdev->bd_holder set to a
+	 * superblock that's about to be freed. IOW, there's a UAF
+	 * somewhere...
+	 */
+	if (WARN_ON_ONCE(!active))
+		return NULL;
+
+	active = super_lock(sb, excl);
 
 	/*
 	 * If the superblock wasn't already SB_DYING then we hold
@@ -1408,7 +1419,7 @@  static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
          */
 	put_super(sb);
 
-	if (!locked)
+	if (!active)
 		return NULL;
 
 	if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5174e821d451..68e453c155af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1201,7 +1201,7 @@  struct super_block {
 	unsigned long		s_magic;
 	struct dentry		*s_root;
 	struct rw_semaphore	s_umount;
-	int			s_count;
+	atomic_t		s_count;
 	atomic_t		s_active;
 #ifdef CONFIG_SECURITY
 	void                    *s_security;