diff mbox

fs: avoid locking sb_lock in grab_super_passive()

Message ID 20150219171934.20458.30175.stgit@buzz (mailing list archive)
State New, archived
Headers show

Commit Message

Konstantin Khlebnikov Feb. 19, 2015, 5:19 p.m. UTC
I've noticed significant locking contention in memory reclaimer around
sb_lock inside grab_super_passive(). Grab_super_passive() is called from
two places: in icache/dcache shrinkers (function super_cache_scan) and
from writeback (function __writeback_inodes_wb). Both are required for
progress in memory reclaimer.

Also this lock isn't irq-safe. And I've seen suspicious livelock under
serious memory pressure where reclaimer was called from interrupt which
have happened right in place where sb_lock is held in normal context,
so all other cpus were stuck on that lock too.

Grab_super_passive() acquires sb_lock to increment sb->s_count and check
sb->s_instances. It seems sb->s_umount locked for read is enough here:
super-block deactivation always runs under sb->s_umount locked for write.
Protecting super-block itself isn't a problem: in super_cache_scan() sb
is protected by shrinker_rwsem: it cannot be freed if its slab shrinkers
are still active. Inside writeback super-block comes from inode from bdi
writeback list under wb->list_lock.

This patch removes locking sb_lock and checks s_instances under s_umount:
generic_shutdown_super() unlinks it under sb->s_umount locked for write.
Now successful grab_super_passive() only locks semaphore, callers must
call up_read(&sb->s_umount) instead of drop_super(sb) when they're done.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 fs/fs-writeback.c |    2 +-
 fs/super.c        |   18 ++++--------------
 2 files changed, 5 insertions(+), 15 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Konstantin Khlebnikov Feb. 19, 2015, 9:06 p.m. UTC | #1
On Thu, Feb 19, 2015 at 8:19 PM, Konstantin Khlebnikov
<khlebnikov@yandex-team.ru> wrote:
> I've noticed significant locking contention in memory reclaimer around
> sb_lock inside grab_super_passive(). Grab_super_passive() is called from
> two places: in icache/dcache shrinkers (function super_cache_scan) and
> from writeback (function __writeback_inodes_wb). Both are required for
> progress in memory reclaimer.
>
> Also this lock isn't irq-safe. And I've seen suspicious livelock under
> serious memory pressure where reclaimer was called from interrupt which

s/reclaimer/allocator/

> have happened right in place where sb_lock is held in normal context,
> so all other cpus were stuck on that lock too.
>
> Grab_super_passive() acquires sb_lock to increment sb->s_count and check
> sb->s_instances. It seems sb->s_umount locked for read is enough here:
> super-block deactivation always runs under sb->s_umount locked for write.
> Protecting super-block itself isn't a problem: in super_cache_scan() sb
> is protected by shrinker_rwsem: it cannot be freed if its slab shrinkers
> are still active. Inside writeback super-block comes from inode from bdi
> writeback list under wb->list_lock.
>
> This patch removes locking sb_lock and checks s_instances under s_umount:
> generic_shutdown_super() unlinks it under sb->s_umount locked for write.
> Now successful grab_super_passive() only locks semaphore, callers must
> call up_read(&sb->s_umount) instead of drop_super(sb) when they're done.
>
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> ---
>  fs/fs-writeback.c |    2 +-
>  fs/super.c        |   18 ++++--------------
>  2 files changed, 5 insertions(+), 15 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 073657f..3e92bb7 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -779,7 +779,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
>                         continue;
>                 }
>                 wrote += writeback_sb_inodes(sb, wb, work);
> -               drop_super(sb);
> +               up_read(&sb->s_umount);
>
>                 /* refer to the same tests at the end of writeback_sb_inodes */
>                 if (wrote) {
> diff --git a/fs/super.c b/fs/super.c
> index 65a53ef..6ae33ed 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -105,7 +105,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
>                 freed += sb->s_op->free_cached_objects(sb, sc);
>         }
>
> -       drop_super(sb);
> +       up_read(&sb->s_umount);
>         return freed;
>  }
>
> @@ -356,27 +356,17 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
>   *     superblock does not go away while we are working on it. It returns
>   *     false if a reference was not gained, and returns true with the s_umount
>   *     lock held in read mode if a reference is gained. On successful return,
> - *     the caller must drop the s_umount lock and the passive reference when
> - *     done.
> + *     the caller must drop the s_umount lock when done.
>   */
>  bool grab_super_passive(struct super_block *sb)
>  {
> -       spin_lock(&sb_lock);
> -       if (hlist_unhashed(&sb->s_instances)) {
> -               spin_unlock(&sb_lock);
> -               return false;
> -       }
> -
> -       sb->s_count++;
> -       spin_unlock(&sb_lock);
> -
>         if (down_read_trylock(&sb->s_umount)) {
> -               if (sb->s_root && (sb->s_flags & MS_BORN))
> +               if (!hlist_unhashed(&sb->s_instances) &&
> +                   sb->s_root && (sb->s_flags & MS_BORN))
>                         return true;
>                 up_read(&sb->s_umount);
>         }
>
> -       put_super(sb);
>         return false;
>  }
>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Morton Feb. 20, 2015, 11:07 p.m. UTC | #2
On Thu, 19 Feb 2015 20:19:35 +0300 Konstantin Khlebnikov <khlebnikov@yandex-team.ru> wrote:
>

Please cc Dave Chinner on this.

> I've noticed significant locking contention in memory reclaimer around
> sb_lock inside grab_super_passive(). Grab_super_passive() is called from
> two places: in icache/dcache shrinkers (function super_cache_scan) and
> from writeback (function __writeback_inodes_wb). Both are required for
> progress in memory reclaimer.
> 
> Also this lock isn't irq-safe. And I've seen suspicious livelock under
> serious memory pressure where reclaimer was called from interrupt which
> have happened right in place where sb_lock is held in normal context,
> so all other cpus were stuck on that lock too.

You mean someone is calling grab_super_passive() (ie: fs writeback)
from interrupt context?  What's the call path?

> Grab_super_passive() acquires sb_lock to increment sb->s_count and check
> sb->s_instances. It seems sb->s_umount locked for read is enough here:
> super-block deactivation always runs under sb->s_umount locked for write.
> Protecting super-block itself isn't a problem: in super_cache_scan() sb
> is protected by shrinker_rwsem: it cannot be freed if its slab shrinkers
> are still active. Inside writeback super-block comes from inode from bdi
> writeback list under wb->list_lock.
> 
> This patch removes locking sb_lock and checks s_instances under s_umount:
> generic_shutdown_super() unlinks it under sb->s_umount locked for write.
> Now successful grab_super_passive() only locks semaphore, callers must
> call up_read(&sb->s_umount) instead of drop_super(sb) when they're done.
> 

The patch looks reasonable to me, but the grab_super_passive()
documentation needs further updating, please.

- It no longer "acquires a reference".  All it does is to acquire an rwsem.

- What the heck is a "passive reference" anyway?  It appears to be
  the situation where we increment s_count without incrementing s_active.

  After your patch, this superblock state no longer exists(?), so
  perhaps the entire "passive reference" concept and any references to
  it can be expunged from the kernel.

  And grab_super_passive() should be renamed anyway.  It no longer
  "grabs" anything - it attempts to acquire ->s_umount. 
  "super_trylock", maybe?

- While we're dicking with the grab_super_passive() documentation,
  let's turn it into kerneldoc by adding the /**.  
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 20, 2015, 11:50 p.m. UTC | #3
On Fri, Feb 20, 2015 at 03:07:31PM -0800, Andrew Morton wrote:

> - It no longer "acquires a reference".  All it does is to acquire an rwsem.
> 
> - What the heck is a "passive reference" anyway?  It appears to be
>   the situation where we increment s_count without incrementing s_active.

Reference to struct super_block that guarantees only that its memory won't
be freed until we drop it.

>   After your patch, this superblock state no longer exists(?),

Yes, it does.  The _only_ reason why that patch isn't outright bogus is that
we do only down_read_trylock() on ->s_umount - try to pull off the same thing
with down_read() and you'll get a nasty race.  Take a look at e.g.
get_super().  Or user_get_super().  Or iterate_supers()/iterate_supers_type(),
where we don't return such references, but pass them to a callback instead.
In all those cases we end up with passive reference taken, ->s_umount
taken shared (_NOT_ with trylock) and fs checked for being still alive.
Then it's guaranteed to stay alive until we do drop_super().

I agree that the name blows, BTW - something like try_get_super() might have
been more descriptive, but with this change it actually becomes a bad name
as well, since after it we need a different way to release the obtained ref;
not the same as after get_super().  Your variant might be OK, but I'd
probably make it trylock_super(), to match the verb-object order of the
rest of identifiers in that area...

> so
>   perhaps the entire "passive reference" concept and any references to
>   it can be expunged from the kernel.

Nope.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 21, 2015, 2:37 a.m. UTC | #4
On Thu, Feb 19, 2015 at 08:19:35PM +0300, Konstantin Khlebnikov wrote:
> I've noticed significant locking contention in memory reclaimer around
> sb_lock inside grab_super_passive(). Grab_super_passive() is called from
> two places: in icache/dcache shrinkers (function super_cache_scan) and
> from writeback (function __writeback_inodes_wb). Both are required for
> progress in memory reclaimer.
> 
> Also this lock isn't irq-safe. And I've seen suspicious livelock under
> serious memory pressure where reclaimer was called from interrupt which
> have happened right in place where sb_lock is held in normal context,
> so all other cpus were stuck on that lock too.

Excuse me, but this part is BS - its call is immediately preceded by
        if (!(sc->gfp_mask & __GFP_FS))
                return SHRINK_STOP;
and if we *ever* hit GFP_FS allocation from interrupt, we are really
screwed.  If nothing else, both prune_dcache_sb() and prune_icache_sb()
can wait for all kinds of IO; you really don't want that called in an
interrupt context.  The same goes for writeback_sb_inodes(), while we
are at it.

If you ever see that in an interrupt context, you have a very bad problem
on hands.

Said that, not bothering with sb_lock (and ->s_count) in those two callers
makes sense.  Applied, with name changed to trylock_super().
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Konstantin Khlebnikov Feb. 24, 2015, 9:19 a.m. UTC | #5
On Sat, Feb 21, 2015 at 5:37 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Feb 19, 2015 at 08:19:35PM +0300, Konstantin Khlebnikov wrote:
>> I've noticed significant locking contention in memory reclaimer around
>> sb_lock inside grab_super_passive(). Grab_super_passive() is called from
>> two places: in icache/dcache shrinkers (function super_cache_scan) and
>> from writeback (function __writeback_inodes_wb). Both are required for
>> progress in memory reclaimer.
>>
>> Also this lock isn't irq-safe. And I've seen suspicious livelock under
>> serious memory pressure where reclaimer was called from interrupt which
>> have happened right in place where sb_lock is held in normal context,
>> so all other cpus were stuck on that lock too.
>
> Excuse me, but this part is BS - its call is immediately preceded by
>         if (!(sc->gfp_mask & __GFP_FS))
>                 return SHRINK_STOP;
> and if we *ever* hit GFP_FS allocation from interrupt, we are really
> screwed.  If nothing else, both prune_dcache_sb() and prune_icache_sb()
> can wait for all kinds of IO; you really don't want that called in an
> interrupt context.  The same goes for writeback_sb_inodes(), while we
> are at it.
>
> If you ever see that in an interrupt context, you have a very bad problem
> on hands.
>
> Said that, not bothering with sb_lock (and ->s_count) in those two callers
> makes sense.  Applied, with name changed to trylock_super().

Ok, thanks. I'll pull this into our kernel and try to catch livelock again.

It seems sb_lock becomes hottest lock by accident: system has no swap
and all page-cache is gone thus all cpus stuck at reclaiming inodes and
dentries. For some reason OOM killer wasn't invoked for hour or so.

Part about reclaimer called from interrupt context was BS for sure
I've mixed up some stacks from that 30Mb log of kernel's suffering.

>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Konstantin Khlebnikov Feb. 24, 2015, 10:41 a.m. UTC | #6
On 21.02.2015 02:50, Al Viro wrote:
> On Fri, Feb 20, 2015 at 03:07:31PM -0800, Andrew Morton wrote:
>
>> - It no longer "acquires a reference".  All it does is to acquire an rwsem.
>>
>> - What the heck is a "passive reference" anyway?  It appears to be
>>    the situation where we increment s_count without incrementing s_active.
>
> Reference to struct super_block that guarantees only that its memory won't
> be freed until we drop it.
>
>>    After your patch, this superblock state no longer exists(?),
>
> Yes, it does.  The _only_ reason why that patch isn't outright bogus is that
> we do only down_read_trylock() on ->s_umount - try to pull off the same thing
> with down_read() and you'll get a nasty race.

I don't get this. What the problem with down_read(sb->s_umount)?

For grab_super_passive()/trylock_super() caller guarantees memory
wouldn't be freed and we check tsb activeness after grabbing shared
lock. And while we hold that lock it'll stay active.

It have to use down_read_trylock() just because it works in in atomic
context when writeback calls it. No?

Check for activeness actually is a quite confusing.
It seems checking for MS_BORN and MS_ACTIVE should be enough:

  bool trylock_super(struct super_block *sb)
  {
         if (down_read_trylock(&sb->s_umount)) {
-               if (!hlist_unhashed(&sb->s_instances) &&
-                   sb->s_root && (sb->s_flags & MS_BORN))
+               if ((sb->s_flags & MS_BORN) && (sb->s_flags & MS_ACTIVE))
                         return true;
                 up_read(&sb->s_umount);
         }

> Take a look at e.g.
> get_super().  Or user_get_super().  Or iterate_supers()/iterate_supers_type(),
> where we don't return such references, but pass them to a callback instead.
> In all those cases we end up with passive reference taken, ->s_umount
> taken shared (_NOT_ with trylock) and fs checked for being still alive.
> Then it's guaranteed to stay alive until we do drop_super().
>
> I agree that the name blows, BTW - something like try_get_super() might have
> been more descriptive, but with this change it actually becomes a bad name
> as well, since after it we need a different way to release the obtained ref;
> not the same as after get_super().  Your variant might be OK, but I'd
> probably make it trylock_super(), to match the verb-object order of the
> rest of identifiers in that area...
>
>> so
>>    perhaps the entire "passive reference" concept and any references to
>>    it can be expunged from the kernel.
>
> Nope.
>
diff mbox

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 073657f..3e92bb7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -779,7 +779,7 @@  static long __writeback_inodes_wb(struct bdi_writeback *wb,
 			continue;
 		}
 		wrote += writeback_sb_inodes(sb, wb, work);
-		drop_super(sb);
+		up_read(&sb->s_umount);
 
 		/* refer to the same tests at the end of writeback_sb_inodes */
 		if (wrote) {
diff --git a/fs/super.c b/fs/super.c
index 65a53ef..6ae33ed 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -105,7 +105,7 @@  static unsigned long super_cache_scan(struct shrinker *shrink,
 		freed += sb->s_op->free_cached_objects(sb, sc);
 	}
 
-	drop_super(sb);
+	up_read(&sb->s_umount);
 	return freed;
 }
 
@@ -356,27 +356,17 @@  static int grab_super(struct super_block *s) __releases(sb_lock)
  *	superblock does not go away while we are working on it. It returns
  *	false if a reference was not gained, and returns true with the s_umount
  *	lock held in read mode if a reference is gained. On successful return,
- *	the caller must drop the s_umount lock and the passive reference when
- *	done.
+ *	the caller must drop the s_umount lock when done.
  */
 bool grab_super_passive(struct super_block *sb)
 {
-	spin_lock(&sb_lock);
-	if (hlist_unhashed(&sb->s_instances)) {
-		spin_unlock(&sb_lock);
-		return false;
-	}
-
-	sb->s_count++;
-	spin_unlock(&sb_lock);
-
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root && (sb->s_flags & MS_BORN))
+		if (!hlist_unhashed(&sb->s_instances) &&
+		    sb->s_root && (sb->s_flags & MS_BORN))
 			return true;
 		up_read(&sb->s_umount);
 	}
 
-	put_super(sb);
 	return false;
 }