
[v2,2/7] mm: vmscan: make global slab shrink lockless

Message ID 20230223132725.11685-3-zhengqi.arch@bytedance.com (mailing list archive)
State New
Series: make slab shrink lockless

Commit Message

Qi Zheng Feb. 23, 2023, 1:27 p.m. UTC
The shrinker_rwsem is a global lock in the shrinker subsystem
and easily causes blocking in the following cases:

a. the write lock of shrinker_rwsem is held for too long.
   For example, when there are many memcgs in the system,
   some paths (e.g. expand_shrinker_info()) hold the lock
   while traversing them for a long time.
b. the read lock of shrinker_rwsem is held for too long and
   a writer arrives in the meantime. This writer is then
   forced to wait, blocking all subsequent readers.
   For example:
   - the holder is scheduled out while the read lock of
     shrinker_rwsem is held in do_shrink_slab()
   - some shrinkers block for too long, like the case
     mentioned in the patchset[1].

Therefore, several attempts have been made over the years
([2],[3],[4],[5]) to replace the shrinker_rwsem readers with
SRCU, but they were all abandoned because SRCU was not
unconditionally enabled.

But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
SRCU is unconditionally enabled. So it's time to use SRCU to
protect the readers that previously took shrinker_rwsem.

[1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
[2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
[3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
[4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
[5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 mm/vmscan.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)
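
For readers following along, the scheme this patch moves to can be condensed
into the following kernel-style sketch (illustrative only and not compilable
on its own; walk_shrinkers() and remove_shrinker() are stand-ins for the
shrink_slab() and unregister_shrinker() changes in the diff quoted in the
replies below):

	DEFINE_SRCU(shrinker_srcu);

	/* Reader: may sleep in shrinker callbacks, never blocks writers. */
	static unsigned long walk_shrinkers(struct shrink_control *sc, int priority)
	{
		struct shrinker *shrinker;
		unsigned long freed = 0;
		int srcu_idx;

		srcu_idx = srcu_read_lock(&shrinker_srcu);
		list_for_each_entry_srcu(shrinker, &shrinker_list, list,
					 srcu_read_lock_held(&shrinker_srcu))
			freed += do_shrink_slab(sc, shrinker, priority);
		srcu_read_unlock(&shrinker_srcu, srcu_idx);

		return freed;
	}

	/* Writer: unlink, then wait out all SRCU readers before freeing. */
	static void remove_shrinker(struct shrinker *shrinker)
	{
		list_del_rcu(&shrinker->list);		/* new walkers no longer see it */
		synchronize_srcu(&shrinker_srcu);	/* existing walkers have finished */
		kfree(shrinker->nr_deferred);		/* safe: no reader can still use it */
	}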

Comments

Rafael Aquini Feb. 23, 2023, 3:26 p.m. UTC | #1
On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
> The shrinker_rwsem is a global lock in shrinkers subsystem,
> it is easy to cause blocking in the following cases:
> 
> a. the write lock of shrinker_rwsem was held for too long.
>    For example, there are many memcgs in the system, which
>    causes some paths to hold locks and traverse it for too
>    long. (e.g. expand_shrinker_info())
> b. the read lock of shrinker_rwsem was held for too long,
>    and a writer came at this time. Then this writer will be
>    forced to wait and block all subsequent readers.
>    For example:
>    - be scheduled when the read lock of shrinker_rwsem is
>      held in do_shrink_slab()
>    - some shrinker are blocked for too long. Like the case
>      mentioned in the patchset[1].
> 
> Therefore, many times in history ([2],[3],[4],[5]), some
> people wanted to replace shrinker_rwsem reader with SRCU,
> but they all gave up because SRCU was not unconditionally
> enabled.
> 
> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
> the SRCU is unconditionally enabled. So it's time to use
> SRCU to protect readers who previously held shrinker_rwsem.
> 
> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  mm/vmscan.c | 27 +++++++++++----------------
>  1 file changed, 11 insertions(+), 16 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9f895ca6216c..02987a6f95d1 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>  
>  LIST_HEAD(shrinker_list);
>  DECLARE_RWSEM(shrinker_rwsem);
> +DEFINE_SRCU(shrinker_srcu);
>  
>  #ifdef CONFIG_MEMCG
>  static int shrinker_nr_max;
> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>  void register_shrinker_prepared(struct shrinker *shrinker)
>  {
>  	down_write(&shrinker_rwsem);

I think you could revert the rwsem back to a simple mutex, now.

> -	list_add_tail(&shrinker->list, &shrinker_list);
> +	list_add_tail_rcu(&shrinker->list, &shrinker_list);
>  	shrinker->flags |= SHRINKER_REGISTERED;
>  	shrinker_debugfs_add(shrinker);
>  	up_write(&shrinker_rwsem);
> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>  		return;
>  
>  	down_write(&shrinker_rwsem);
> -	list_del(&shrinker->list);
> +	list_del_rcu(&shrinker->list);
>  	shrinker->flags &= ~SHRINKER_REGISTERED;
>  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>  		unregister_memcg_shrinker(shrinker);
>  	debugfs_entry = shrinker_debugfs_remove(shrinker);
>  	up_write(&shrinker_rwsem);
>  
> +	synchronize_srcu(&shrinker_srcu);
> +
>  	debugfs_remove_recursive(debugfs_entry);
>  
>  	kfree(shrinker->nr_deferred);
> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>  {
>  	down_write(&shrinker_rwsem);
>  	up_write(&shrinker_rwsem);
> +	synchronize_srcu(&shrinker_srcu);
>  }
>  EXPORT_SYMBOL(synchronize_shrinkers);
>  
> @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  {
>  	unsigned long ret, freed = 0;
>  	struct shrinker *shrinker;
> +	int srcu_idx;
>  
>  	/*
>  	 * The root memcg might be allocated even though memcg is disabled
> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>  		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>  
> -	if (!down_read_trylock(&shrinker_rwsem))
> -		goto out;
> +	srcu_idx = srcu_read_lock(&shrinker_srcu);
>  
> -	list_for_each_entry(shrinker, &shrinker_list, list) {
> +	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
> +				 srcu_read_lock_held(&shrinker_srcu)) {
>  		struct shrink_control sc = {
>  			.gfp_mask = gfp_mask,
>  			.nid = nid,
> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  		if (ret == SHRINK_EMPTY)
>  			ret = 0;
>  		freed += ret;
> -		/*
> -		 * Bail out if someone want to register a new shrinker to
> -		 * prevent the registration from being stalled for long periods
> -		 * by parallel ongoing shrinking.
> -		 */
> -		if (rwsem_is_contended(&shrinker_rwsem)) {
> -			freed = freed ? : 1;
> -			break;
> -		}
>  	}
>  
> -	up_read(&shrinker_rwsem);
> -out:
> +	srcu_read_unlock(&shrinker_srcu, srcu_idx);
>  	cond_resched();
>  	return freed;
>  }
> -- 
> 2.20.1
> 
>
Rafael Aquini Feb. 23, 2023, 3:37 p.m. UTC | #2
On Thu, Feb 23, 2023 at 10:26:45AM -0500, Rafael Aquini wrote:
> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
> > The shrinker_rwsem is a global lock in shrinkers subsystem,
> > it is easy to cause blocking in the following cases:
> > 
> > a. the write lock of shrinker_rwsem was held for too long.
> >    For example, there are many memcgs in the system, which
> >    causes some paths to hold locks and traverse it for too
> >    long. (e.g. expand_shrinker_info())
> > b. the read lock of shrinker_rwsem was held for too long,
> >    and a writer came at this time. Then this writer will be
> >    forced to wait and block all subsequent readers.
> >    For example:
> >    - be scheduled when the read lock of shrinker_rwsem is
> >      held in do_shrink_slab()
> >    - some shrinker are blocked for too long. Like the case
> >      mentioned in the patchset[1].
> > 
> > Therefore, many times in history ([2],[3],[4],[5]), some
> > people wanted to replace shrinker_rwsem reader with SRCU,
> > but they all gave up because SRCU was not unconditionally
> > enabled.
> > 
> > But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
> > the SRCU is unconditionally enabled. So it's time to use
> > SRCU to protect readers who previously held shrinker_rwsem.
> > 
> > [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
> > [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
> > [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
> > [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
> > [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
> > 
> > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > ---
> >  mm/vmscan.c | 27 +++++++++++----------------
> >  1 file changed, 11 insertions(+), 16 deletions(-)
> > 
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 9f895ca6216c..02987a6f95d1 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
> >  
> >  LIST_HEAD(shrinker_list);
> >  DECLARE_RWSEM(shrinker_rwsem);
> > +DEFINE_SRCU(shrinker_srcu);
> >  
> >  #ifdef CONFIG_MEMCG
> >  static int shrinker_nr_max;
> > @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
> >  void register_shrinker_prepared(struct shrinker *shrinker)
> >  {
> >  	down_write(&shrinker_rwsem);
> 
> I think you could revert the rwsem back to a simple mutex, now.
>

NVM, that's exactly what patch 7 does. :)
Sultan Alsawaf (unemployed) Feb. 23, 2023, 6:24 p.m. UTC | #3
On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
> The shrinker_rwsem is a global lock in shrinkers subsystem,
> it is easy to cause blocking in the following cases:
> 
> a. the write lock of shrinker_rwsem was held for too long.
>    For example, there are many memcgs in the system, which
>    causes some paths to hold locks and traverse it for too
>    long. (e.g. expand_shrinker_info())
> b. the read lock of shrinker_rwsem was held for too long,
>    and a writer came at this time. Then this writer will be
>    forced to wait and block all subsequent readers.
>    For example:
>    - be scheduled when the read lock of shrinker_rwsem is
>      held in do_shrink_slab()
>    - some shrinker are blocked for too long. Like the case
>      mentioned in the patchset[1].
> 
> Therefore, many times in history ([2],[3],[4],[5]), some
> people wanted to replace shrinker_rwsem reader with SRCU,
> but they all gave up because SRCU was not unconditionally
> enabled.
> 
> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
> the SRCU is unconditionally enabled. So it's time to use
> SRCU to protect readers who previously held shrinker_rwsem.
> 
> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  mm/vmscan.c | 27 +++++++++++----------------
>  1 file changed, 11 insertions(+), 16 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9f895ca6216c..02987a6f95d1 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>  
>  LIST_HEAD(shrinker_list);
>  DECLARE_RWSEM(shrinker_rwsem);
> +DEFINE_SRCU(shrinker_srcu);
>  
>  #ifdef CONFIG_MEMCG
>  static int shrinker_nr_max;
> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>  void register_shrinker_prepared(struct shrinker *shrinker)
>  {
>  	down_write(&shrinker_rwsem);
> -	list_add_tail(&shrinker->list, &shrinker_list);
> +	list_add_tail_rcu(&shrinker->list, &shrinker_list);
>  	shrinker->flags |= SHRINKER_REGISTERED;
>  	shrinker_debugfs_add(shrinker);
>  	up_write(&shrinker_rwsem);
> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>  		return;
>  
>  	down_write(&shrinker_rwsem);
> -	list_del(&shrinker->list);
> +	list_del_rcu(&shrinker->list);
>  	shrinker->flags &= ~SHRINKER_REGISTERED;
>  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>  		unregister_memcg_shrinker(shrinker);
>  	debugfs_entry = shrinker_debugfs_remove(shrinker);
>  	up_write(&shrinker_rwsem);
>  
> +	synchronize_srcu(&shrinker_srcu);
> +
>  	debugfs_remove_recursive(debugfs_entry);
>  
>  	kfree(shrinker->nr_deferred);
> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>  {
>  	down_write(&shrinker_rwsem);
>  	up_write(&shrinker_rwsem);
> +	synchronize_srcu(&shrinker_srcu);
>  }
>  EXPORT_SYMBOL(synchronize_shrinkers);
>  
> @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  {
>  	unsigned long ret, freed = 0;
>  	struct shrinker *shrinker;
> +	int srcu_idx;
>  
>  	/*
>  	 * The root memcg might be allocated even though memcg is disabled
> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>  		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>  
> -	if (!down_read_trylock(&shrinker_rwsem))
> -		goto out;
> +	srcu_idx = srcu_read_lock(&shrinker_srcu);
>  
> -	list_for_each_entry(shrinker, &shrinker_list, list) {
> +	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
> +				 srcu_read_lock_held(&shrinker_srcu)) {
>  		struct shrink_control sc = {
>  			.gfp_mask = gfp_mask,
>  			.nid = nid,
> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  		if (ret == SHRINK_EMPTY)
>  			ret = 0;
>  		freed += ret;
> -		/*
> -		 * Bail out if someone want to register a new shrinker to
> -		 * prevent the registration from being stalled for long periods
> -		 * by parallel ongoing shrinking.
> -		 */
> -		if (rwsem_is_contended(&shrinker_rwsem)) {
> -			freed = freed ? : 1;
> -			break;
> -		}
>  	}
>  
> -	up_read(&shrinker_rwsem);
> -out:
> +	srcu_read_unlock(&shrinker_srcu, srcu_idx);
>  	cond_resched();
>  	return freed;
>  }
> -- 
> 2.20.1
> 
> 

Hi Qi,

A different problem I realized after my old attempt to use SRCU was that the
unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
call. Both register_shrinker() *and* unregister_shrinker() are called frequently
these days, and SRCU is too unfair to the unregister path IMO.

Although I never got around to submitting it, I made a non-SRCU solution [1]
that uses fine-grained locking instead, which is fair to both the register path
and unregister path. (The patch I've linked is a version of this adapted to an
older 4.14 kernel FYI, but it can be reworked for the current kernel.)

What do you think about the fine-grained locking approach?

Thanks,
Sultan

[1] https://github.com/kerneltoast/android_kernel_google_floral/commit/012378f3173a82d2333d3ae7326691544301e76a
Paul E. McKenney Feb. 23, 2023, 6:39 p.m. UTC | #4
On Thu, Feb 23, 2023 at 10:24:47AM -0800, Sultan Alsawaf wrote:
> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
> > The shrinker_rwsem is a global lock in shrinkers subsystem,
> > it is easy to cause blocking in the following cases:
> > 
> > a. the write lock of shrinker_rwsem was held for too long.
> >    For example, there are many memcgs in the system, which
> >    causes some paths to hold locks and traverse it for too
> >    long. (e.g. expand_shrinker_info())
> > b. the read lock of shrinker_rwsem was held for too long,
> >    and a writer came at this time. Then this writer will be
> >    forced to wait and block all subsequent readers.
> >    For example:
> >    - be scheduled when the read lock of shrinker_rwsem is
> >      held in do_shrink_slab()
> >    - some shrinker are blocked for too long. Like the case
> >      mentioned in the patchset[1].
> > 
> > Therefore, many times in history ([2],[3],[4],[5]), some
> > people wanted to replace shrinker_rwsem reader with SRCU,
> > but they all gave up because SRCU was not unconditionally
> > enabled.
> > 
> > But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
> > the SRCU is unconditionally enabled. So it's time to use
> > SRCU to protect readers who previously held shrinker_rwsem.
> > 
> > [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
> > [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
> > [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
> > [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
> > [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
> > 
> > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > ---
> >  mm/vmscan.c | 27 +++++++++++----------------
> >  1 file changed, 11 insertions(+), 16 deletions(-)
> > 
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 9f895ca6216c..02987a6f95d1 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
> >  
> >  LIST_HEAD(shrinker_list);
> >  DECLARE_RWSEM(shrinker_rwsem);
> > +DEFINE_SRCU(shrinker_srcu);
> >  
> >  #ifdef CONFIG_MEMCG
> >  static int shrinker_nr_max;
> > @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
> >  void register_shrinker_prepared(struct shrinker *shrinker)
> >  {
> >  	down_write(&shrinker_rwsem);
> > -	list_add_tail(&shrinker->list, &shrinker_list);
> > +	list_add_tail_rcu(&shrinker->list, &shrinker_list);
> >  	shrinker->flags |= SHRINKER_REGISTERED;
> >  	shrinker_debugfs_add(shrinker);
> >  	up_write(&shrinker_rwsem);
> > @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
> >  		return;
> >  
> >  	down_write(&shrinker_rwsem);
> > -	list_del(&shrinker->list);
> > +	list_del_rcu(&shrinker->list);
> >  	shrinker->flags &= ~SHRINKER_REGISTERED;
> >  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
> >  		unregister_memcg_shrinker(shrinker);
> >  	debugfs_entry = shrinker_debugfs_remove(shrinker);
> >  	up_write(&shrinker_rwsem);
> >  
> > +	synchronize_srcu(&shrinker_srcu);
> > +
> >  	debugfs_remove_recursive(debugfs_entry);
> >  
> >  	kfree(shrinker->nr_deferred);
> > @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
> >  {
> >  	down_write(&shrinker_rwsem);
> >  	up_write(&shrinker_rwsem);
> > +	synchronize_srcu(&shrinker_srcu);
> >  }
> >  EXPORT_SYMBOL(synchronize_shrinkers);
> >  
> > @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> >  {
> >  	unsigned long ret, freed = 0;
> >  	struct shrinker *shrinker;
> > +	int srcu_idx;
> >  
> >  	/*
> >  	 * The root memcg might be allocated even though memcg is disabled
> > @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> >  	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
> >  		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
> >  
> > -	if (!down_read_trylock(&shrinker_rwsem))
> > -		goto out;
> > +	srcu_idx = srcu_read_lock(&shrinker_srcu);
> >  
> > -	list_for_each_entry(shrinker, &shrinker_list, list) {
> > +	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
> > +				 srcu_read_lock_held(&shrinker_srcu)) {
> >  		struct shrink_control sc = {
> >  			.gfp_mask = gfp_mask,
> >  			.nid = nid,
> > @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> >  		if (ret == SHRINK_EMPTY)
> >  			ret = 0;
> >  		freed += ret;
> > -		/*
> > -		 * Bail out if someone want to register a new shrinker to
> > -		 * prevent the registration from being stalled for long periods
> > -		 * by parallel ongoing shrinking.
> > -		 */
> > -		if (rwsem_is_contended(&shrinker_rwsem)) {
> > -			freed = freed ? : 1;
> > -			break;
> > -		}
> >  	}
> >  
> > -	up_read(&shrinker_rwsem);
> > -out:
> > +	srcu_read_unlock(&shrinker_srcu, srcu_idx);
> >  	cond_resched();
> >  	return freed;
> >  }
> > -- 
> > 2.20.1
> > 
> > 
> 
> Hi Qi,
> 
> A different problem I realized after my old attempt to use SRCU was that the
> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
> these days, and SRCU is too unfair to the unregister path IMO.
> 
> Although I never got around to submitting it, I made a non-SRCU solution [1]
> that uses fine-grained locking instead, which is fair to both the register path
> and unregister path. (The patch I've linked is a version of this adapted to an
> older 4.14 kernel FYI, but it can be reworked for the current kernel.)
> 
> What do you think about the fine-grained locking approach?

Another approach is to use synchronize_srcu_expedited(), which avoids
the sleeps that are otherwise used to encourage sharing of grace periods
among concurrent requests.  It might be possible to use call_srcu(),
but I don't claim to know the shrinker code well enough to say for sure.

							Thanx, Paul

> Thanks,
> Sultan
> 
> [1] https://github.com/kerneltoast/android_kernel_google_floral/commit/012378f3173a82d2333d3ae7326691544301e76a
Sultan Alsawaf (unemployed) Feb. 23, 2023, 7:18 p.m. UTC | #5
On Thu, Feb 23, 2023 at 10:39:17AM -0800, Paul E. McKenney wrote:
> On Thu, Feb 23, 2023 at 10:24:47AM -0800, Sultan Alsawaf wrote:
> > On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
> > > The shrinker_rwsem is a global lock in shrinkers subsystem,
> > > it is easy to cause blocking in the following cases:
> > > 
> > > a. the write lock of shrinker_rwsem was held for too long.
> > >    For example, there are many memcgs in the system, which
> > >    causes some paths to hold locks and traverse it for too
> > >    long. (e.g. expand_shrinker_info())
> > > b. the read lock of shrinker_rwsem was held for too long,
> > >    and a writer came at this time. Then this writer will be
> > >    forced to wait and block all subsequent readers.
> > >    For example:
> > >    - be scheduled when the read lock of shrinker_rwsem is
> > >      held in do_shrink_slab()
> > >    - some shrinker are blocked for too long. Like the case
> > >      mentioned in the patchset[1].
> > > 
> > > Therefore, many times in history ([2],[3],[4],[5]), some
> > > people wanted to replace shrinker_rwsem reader with SRCU,
> > > but they all gave up because SRCU was not unconditionally
> > > enabled.
> > > 
> > > But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
> > > the SRCU is unconditionally enabled. So it's time to use
> > > SRCU to protect readers who previously held shrinker_rwsem.
> > > 
> > > [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
> > > [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
> > > [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
> > > [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
> > > [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
> > > 
> > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > ---
> > >  mm/vmscan.c | 27 +++++++++++----------------
> > >  1 file changed, 11 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 9f895ca6216c..02987a6f95d1 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
> > >  
> > >  LIST_HEAD(shrinker_list);
> > >  DECLARE_RWSEM(shrinker_rwsem);
> > > +DEFINE_SRCU(shrinker_srcu);
> > >  
> > >  #ifdef CONFIG_MEMCG
> > >  static int shrinker_nr_max;
> > > @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
> > >  void register_shrinker_prepared(struct shrinker *shrinker)
> > >  {
> > >  	down_write(&shrinker_rwsem);
> > > -	list_add_tail(&shrinker->list, &shrinker_list);
> > > +	list_add_tail_rcu(&shrinker->list, &shrinker_list);
> > >  	shrinker->flags |= SHRINKER_REGISTERED;
> > >  	shrinker_debugfs_add(shrinker);
> > >  	up_write(&shrinker_rwsem);
> > > @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
> > >  		return;
> > >  
> > >  	down_write(&shrinker_rwsem);
> > > -	list_del(&shrinker->list);
> > > +	list_del_rcu(&shrinker->list);
> > >  	shrinker->flags &= ~SHRINKER_REGISTERED;
> > >  	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
> > >  		unregister_memcg_shrinker(shrinker);
> > >  	debugfs_entry = shrinker_debugfs_remove(shrinker);
> > >  	up_write(&shrinker_rwsem);
> > >  
> > > +	synchronize_srcu(&shrinker_srcu);
> > > +
> > >  	debugfs_remove_recursive(debugfs_entry);
> > >  
> > >  	kfree(shrinker->nr_deferred);
> > > @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
> > >  {
> > >  	down_write(&shrinker_rwsem);
> > >  	up_write(&shrinker_rwsem);
> > > +	synchronize_srcu(&shrinker_srcu);
> > >  }
> > >  EXPORT_SYMBOL(synchronize_shrinkers);
> > >  
> > > @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> > >  {
> > >  	unsigned long ret, freed = 0;
> > >  	struct shrinker *shrinker;
> > > +	int srcu_idx;
> > >  
> > >  	/*
> > >  	 * The root memcg might be allocated even though memcg is disabled
> > > @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> > >  	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
> > >  		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
> > >  
> > > -	if (!down_read_trylock(&shrinker_rwsem))
> > > -		goto out;
> > > +	srcu_idx = srcu_read_lock(&shrinker_srcu);
> > >  
> > > -	list_for_each_entry(shrinker, &shrinker_list, list) {
> > > +	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
> > > +				 srcu_read_lock_held(&shrinker_srcu)) {
> > >  		struct shrink_control sc = {
> > >  			.gfp_mask = gfp_mask,
> > >  			.nid = nid,
> > > @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> > >  		if (ret == SHRINK_EMPTY)
> > >  			ret = 0;
> > >  		freed += ret;
> > > -		/*
> > > -		 * Bail out if someone want to register a new shrinker to
> > > -		 * prevent the registration from being stalled for long periods
> > > -		 * by parallel ongoing shrinking.
> > > -		 */
> > > -		if (rwsem_is_contended(&shrinker_rwsem)) {
> > > -			freed = freed ? : 1;
> > > -			break;
> > > -		}
> > >  	}
> > >  
> > > -	up_read(&shrinker_rwsem);
> > > -out:
> > > +	srcu_read_unlock(&shrinker_srcu, srcu_idx);
> > >  	cond_resched();
> > >  	return freed;
> > >  }
> > > -- 
> > > 2.20.1
> > > 
> > > 
> > 
> > Hi Qi,
> > 
> > A different problem I realized after my old attempt to use SRCU was that the
> > unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
> > call. Both register_shrinker() *and* unregister_shrinker() are called frequently
> > these days, and SRCU is too unfair to the unregister path IMO.
> > 
> > Although I never got around to submitting it, I made a non-SRCU solution [1]
> > that uses fine-grained locking instead, which is fair to both the register path
> > and unregister path. (The patch I've linked is a version of this adapted to an
> > older 4.14 kernel FYI, but it can be reworked for the current kernel.)
> > 
> > What do you think about the fine-grained locking approach?
> 
> Another approach is to use synchronize_srcu_expedited(), which avoids
> the sleeps that are otherwise used to encourage sharing of grace periods
> among concurrent requests.  It might be possible to use call_srcu(),
> but I don't claim to know the shrinker code well enough to say for sure.

Hi Paul,

I don't believe call_srcu() can be used since shrinker users need to be
guaranteed that their shrinkers aren't in use after unregister_shrinker().
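
To make that concrete: callers rely on unregister_shrinker() being synchronous
so that teardown like the snippet below is safe (struct example_dev and
example_teardown() are made up purely for illustration):

	static void example_teardown(struct example_dev *dev)
	{
		unregister_shrinker(&dev->shrinker);
		/*
		 * If unregister_shrinker() merely queued a call_srcu()
		 * callback and returned, shrink_slab() could still be
		 * invoking dev->shrinker.scan_objects() when dev is
		 * freed below.
		 */
		kfree(dev);
	}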

Using synchronize_srcu_expedited() sounds like it'd definitely help, though
unregistering a single shrinker would ultimately still require waiting for all
shrinkers to finish running before the grace period can elapse. There can be
many shrinkers and they're not very fast I think.

Thanks,
Sultan

> 
> 							Thanx, Paul
> 
> > Thanks,
> > Sultan
> > 
> > [1] https://github.com/kerneltoast/android_kernel_google_floral/commit/012378f3173a82d2333d3ae7326691544301e76a
>
Qi Zheng Feb. 24, 2023, 4 a.m. UTC | #6
On 2023/2/24 02:24, Sultan Alsawaf wrote:
> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>> it is easy to cause blocking in the following cases:
>>
>> a. the write lock of shrinker_rwsem was held for too long.
>>     For example, there are many memcgs in the system, which
>>     causes some paths to hold locks and traverse it for too
>>     long. (e.g. expand_shrinker_info())
>> b. the read lock of shrinker_rwsem was held for too long,
>>     and a writer came at this time. Then this writer will be
>>     forced to wait and block all subsequent readers.
>>     For example:
>>     - be scheduled when the read lock of shrinker_rwsem is
>>       held in do_shrink_slab()
>>     - some shrinker are blocked for too long. Like the case
>>       mentioned in the patchset[1].
>>
>> Therefore, many times in history ([2],[3],[4],[5]), some
>> people wanted to replace shrinker_rwsem reader with SRCU,
>> but they all gave up because SRCU was not unconditionally
>> enabled.
>>
>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>> the SRCU is unconditionally enabled. So it's time to use
>> SRCU to protect readers who previously held shrinker_rwsem.
>>
>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>>   mm/vmscan.c | 27 +++++++++++----------------
>>   1 file changed, 11 insertions(+), 16 deletions(-)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 9f895ca6216c..02987a6f95d1 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>   
>>   LIST_HEAD(shrinker_list);
>>   DECLARE_RWSEM(shrinker_rwsem);
>> +DEFINE_SRCU(shrinker_srcu);
>>   
>>   #ifdef CONFIG_MEMCG
>>   static int shrinker_nr_max;
>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>   void register_shrinker_prepared(struct shrinker *shrinker)
>>   {
>>   	down_write(&shrinker_rwsem);
>> -	list_add_tail(&shrinker->list, &shrinker_list);
>> +	list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>   	shrinker->flags |= SHRINKER_REGISTERED;
>>   	shrinker_debugfs_add(shrinker);
>>   	up_write(&shrinker_rwsem);
>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>   		return;
>>   
>>   	down_write(&shrinker_rwsem);
>> -	list_del(&shrinker->list);
>> +	list_del_rcu(&shrinker->list);
>>   	shrinker->flags &= ~SHRINKER_REGISTERED;
>>   	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>   		unregister_memcg_shrinker(shrinker);
>>   	debugfs_entry = shrinker_debugfs_remove(shrinker);
>>   	up_write(&shrinker_rwsem);
>>   
>> +	synchronize_srcu(&shrinker_srcu);
>> +
>>   	debugfs_remove_recursive(debugfs_entry);
>>   
>>   	kfree(shrinker->nr_deferred);
>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>   {
>>   	down_write(&shrinker_rwsem);
>>   	up_write(&shrinker_rwsem);
>> +	synchronize_srcu(&shrinker_srcu);
>>   }
>>   EXPORT_SYMBOL(synchronize_shrinkers);
>>   
>> @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>   {
>>   	unsigned long ret, freed = 0;
>>   	struct shrinker *shrinker;
>> +	int srcu_idx;
>>   
>>   	/*
>>   	 * The root memcg might be allocated even though memcg is disabled
>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>   	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>   		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>   
>> -	if (!down_read_trylock(&shrinker_rwsem))
>> -		goto out;
>> +	srcu_idx = srcu_read_lock(&shrinker_srcu);
>>   
>> -	list_for_each_entry(shrinker, &shrinker_list, list) {
>> +	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>> +				 srcu_read_lock_held(&shrinker_srcu)) {
>>   		struct shrink_control sc = {
>>   			.gfp_mask = gfp_mask,
>>   			.nid = nid,
>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>   		if (ret == SHRINK_EMPTY)
>>   			ret = 0;
>>   		freed += ret;
>> -		/*
>> -		 * Bail out if someone want to register a new shrinker to
>> -		 * prevent the registration from being stalled for long periods
>> -		 * by parallel ongoing shrinking.
>> -		 */
>> -		if (rwsem_is_contended(&shrinker_rwsem)) {
>> -			freed = freed ? : 1;
>> -			break;
>> -		}
>>   	}
>>   
>> -	up_read(&shrinker_rwsem);
>> -out:
>> +	srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>   	cond_resched();
>>   	return freed;
>>   }
>> -- 
>> 2.20.1
>>
>>
> 
> Hi Qi,
> 
> A different problem I realized after my old attempt to use SRCU was that the
> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
> these days, and SRCU is too unfair to the unregister path IMO.

Hi Sultan,

IIUC, for unregister_shrinker(), the wait time is hardly longer with
SRCU than with shrinker_rwsem before.

And I just did a simple test. After using the script in cover letter to
increase the shrink_slab hotspot, I did umount 1k times at the same
time, and then I used bpftrace to measure the time consumption of
unregister_shrinker() as follows:

bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; }
             kretprobe:unregister_shrinker /@start[tid]/ {
                     @ns[comm] = hist(nsecs - @start[tid]);
                     delete(@start[tid]);
             }'

@ns[umount]:
[16K, 32K)             3 |                                                    |
[32K, 64K)            66 |@@@@@@@@@@                                          |
[64K, 128K)           32 |@@@@@                                               |
[128K, 256K)          22 |@@@                                                 |
[256K, 512K)          48 |@@@@@@@                                             |
[512K, 1M)            19 |@@@                                                 |
[1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@                               |
[2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
[8M, 16M)             55 |@@@@@@@@@                                           |

I see that the worst-case latency of unregister_shrinker() is
between 8ms and 16ms, which feels tolerable?

Thanks,
Qi

> 
> Although I never got around to submitting it, I made a non-SRCU solution [1]
> that uses fine-grained locking instead, which is fair to both the register path
> and unregister path. (The patch I've linked is a version of this adapted to an
> older 4.14 kernel FYI, but it can be reworked for the current kernel.)
> 
> What do you think about the fine-grained locking approach?
> 
> Thanks,
> Sultan
> 
> [1] https://github.com/kerneltoast/android_kernel_google_floral/commit/012378f3173a82d2333d3ae7326691544301e76a
>
Qi Zheng Feb. 24, 2023, 4:09 a.m. UTC | #7
On 2023/2/23 23:37, Rafael Aquini wrote:
> On Thu, Feb 23, 2023 at 10:26:45AM -0500, Rafael Aquini wrote:
>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>> it is easy to cause blocking in the following cases:
>>>
>>> a. the write lock of shrinker_rwsem was held for too long.
>>>     For example, there are many memcgs in the system, which
>>>     causes some paths to hold locks and traverse it for too
>>>     long. (e.g. expand_shrinker_info())
>>> b. the read lock of shrinker_rwsem was held for too long,
>>>     and a writer came at this time. Then this writer will be
>>>     forced to wait and block all subsequent readers.
>>>     For example:
>>>     - be scheduled when the read lock of shrinker_rwsem is
>>>       held in do_shrink_slab()
>>>     - some shrinker are blocked for too long. Like the case
>>>       mentioned in the patchset[1].
>>>
>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>> but they all gave up because SRCU was not unconditionally
>>> enabled.
>>>
>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>> the SRCU is unconditionally enabled. So it's time to use
>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>
>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>>   mm/vmscan.c | 27 +++++++++++----------------
>>>   1 file changed, 11 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 9f895ca6216c..02987a6f95d1 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>   
>>>   LIST_HEAD(shrinker_list);
>>>   DECLARE_RWSEM(shrinker_rwsem);
>>> +DEFINE_SRCU(shrinker_srcu);
>>>   
>>>   #ifdef CONFIG_MEMCG
>>>   static int shrinker_nr_max;
>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>   void register_shrinker_prepared(struct shrinker *shrinker)
>>>   {
>>>   	down_write(&shrinker_rwsem);
>>
>> I think you could revert the rwsem back to a simple mutex, now.
>>
> 
> NVM, that's exactly what patch 7 does. :)

Yeah. :)

> 
>   
>
Qi Zheng Feb. 24, 2023, 4:16 a.m. UTC | #8
On 2023/2/24 12:00, Qi Zheng wrote:
> 
> 
> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>> it is easy to cause blocking in the following cases:
>>>
>>> a. the write lock of shrinker_rwsem was held for too long.
>>>     For example, there are many memcgs in the system, which
>>>     causes some paths to hold locks and traverse it for too
>>>     long. (e.g. expand_shrinker_info())
>>> b. the read lock of shrinker_rwsem was held for too long,
>>>     and a writer came at this time. Then this writer will be
>>>     forced to wait and block all subsequent readers.
>>>     For example:
>>>     - be scheduled when the read lock of shrinker_rwsem is
>>>       held in do_shrink_slab()
>>>     - some shrinker are blocked for too long. Like the case
>>>       mentioned in the patchset[1].
>>>
>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>> but they all gave up because SRCU was not unconditionally
>>> enabled.
>>>
>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>> the SRCU is unconditionally enabled. So it's time to use
>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>
>>> [1]. 
>>> https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>> [3]. 
>>> https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>> [4]. 
>>> https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>> [5]. 
>>> https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>>   mm/vmscan.c | 27 +++++++++++----------------
>>>   1 file changed, 11 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 9f895ca6216c..02987a6f95d1 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct 
>>> task_struct *task,
>>>   LIST_HEAD(shrinker_list);
>>>   DECLARE_RWSEM(shrinker_rwsem);
>>> +DEFINE_SRCU(shrinker_srcu);
>>>   #ifdef CONFIG_MEMCG
>>>   static int shrinker_nr_max;
>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker 
>>> *shrinker)
>>>   void register_shrinker_prepared(struct shrinker *shrinker)
>>>   {
>>>       down_write(&shrinker_rwsem);
>>> -    list_add_tail(&shrinker->list, &shrinker_list);
>>> +    list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>       shrinker->flags |= SHRINKER_REGISTERED;
>>>       shrinker_debugfs_add(shrinker);
>>>       up_write(&shrinker_rwsem);
>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker 
>>> *shrinker)
>>>           return;
>>>       down_write(&shrinker_rwsem);
>>> -    list_del(&shrinker->list);
>>> +    list_del_rcu(&shrinker->list);
>>>       shrinker->flags &= ~SHRINKER_REGISTERED;
>>>       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>           unregister_memcg_shrinker(shrinker);
>>>       debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>       up_write(&shrinker_rwsem);
>>> +    synchronize_srcu(&shrinker_srcu);
>>> +
>>>       debugfs_remove_recursive(debugfs_entry);
>>>       kfree(shrinker->nr_deferred);
>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>   {
>>>       down_write(&shrinker_rwsem);
>>>       up_write(&shrinker_rwsem);
>>> +    synchronize_srcu(&shrinker_srcu);
>>>   }
>>>   EXPORT_SYMBOL(synchronize_shrinkers);
>>> @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, 
>>> int nid,
>>>   {
>>>       unsigned long ret, freed = 0;
>>>       struct shrinker *shrinker;
>>> +    int srcu_idx;
>>>       /*
>>>        * The root memcg might be allocated even though memcg is disabled
>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t 
>>> gfp_mask, int nid,
>>>       if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>           return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>> -    if (!down_read_trylock(&shrinker_rwsem))
>>> -        goto out;
>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>> -    list_for_each_entry(shrinker, &shrinker_list, list) {
>>> +    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>> +                 srcu_read_lock_held(&shrinker_srcu)) {
>>>           struct shrink_control sc = {
>>>               .gfp_mask = gfp_mask,
>>>               .nid = nid,
>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t 
>>> gfp_mask, int nid,
>>>           if (ret == SHRINK_EMPTY)
>>>               ret = 0;
>>>           freed += ret;
>>> -        /*
>>> -         * Bail out if someone want to register a new shrinker to
>>> -         * prevent the registration from being stalled for long periods
>>> -         * by parallel ongoing shrinking.
>>> -         */
>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>> -            freed = freed ? : 1;
>>> -            break;
>>> -        }
>>>       }
>>> -    up_read(&shrinker_rwsem);
>>> -out:
>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>       cond_resched();
>>>       return freed;
>>>   }
>>> -- 
>>> 2.20.1
>>>
>>>
>>
>> Hi Qi,
>>
>> A different problem I realized after my old attempt to use SRCU was 
>> that the
>> unregister_shrinker() path became quite slow due to the heavy 
>> synchronize_srcu()
>> call. Both register_shrinker() *and* unregister_shrinker() are called 
>> frequently
>> these days, and SRCU is too unfair to the unregister path IMO.
> 
> Hi Sultan,
> 
> IIUC, for unregister_shrinker(), the wait time is hardly longer with
> SRCU than with shrinker_rwsem before.
> 
> And I just did a simple test. After using the script in cover letter to
> increase the shrink_slab hotspot, I did umount 1k times at the same
> time, and then I used bpftrace to measure the time consumption of
> unregister_shrinker() as follows:
> 
> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } 
> kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - 
> @start[tid]); delete(@start[tid]); }'
> 
> @ns[umount]:
> [16K, 32K)             3 |                                                    |
> [32K, 64K)            66 |@@@@@@@@@@                                          |
> [64K, 128K)           32 |@@@@@                                               |
> [128K, 256K)          22 |@@@                                                 |
> [256K, 512K)          48 |@@@@@@@                                             |
> [512K, 1M)            19 |@@@                                                 |
> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@                               |
> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
> [8M, 16M)             55 |@@@@@@@@@                                           |
> 
> I see that the highest time-consuming of unregister_shrinker() is 
> between 8ms and 16ms, which feels tolerable?

And when I use synchronize_srcu_expedited() as Paul suggested, the
measured latency drops much more noticeably:

@ns[umount]:
[16K, 32K)           105 |@@@@@@@@@@                                          |
[32K, 64K)           521 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[64K, 128K)          119 |@@@@@@@@@@@                                         |
[128K, 256K)          32 |@@@                                                 |
[256K, 512K)          70 |@@@@@@                                              |
[512K, 1M)            49 |@@@@                                                |
[1M, 2M)              34 |@@@                                                 |
[2M, 4M)              18 |@                                                   |
[4M, 8M)               4 |                                                    |
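
For reference, the variant measured above only changes the grace-period call
in the unregister path (a minimal sketch against the patch at the top of the
thread; everything else is unchanged):

	void unregister_shrinker(struct shrinker *shrinker)
	{
		/* ... unchanged: unlink under shrinker_rwsem, clear flags ... */

		/*
		 * The expedited grace period skips the sleeps normally used
		 * to batch concurrent requests, so it completes sooner at
		 * the cost of extra CPU work.
		 */
		synchronize_srcu_expedited(&shrinker_srcu);

		/* ... unchanged: debugfs cleanup, kfree(shrinker->nr_deferred) ... */
	}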

> 
> Thanks,
> Qi
> 
>>
>> Although I never got around to submitting it, I made a non-SRCU 
>> solution [1]
>> that uses fine-grained locking instead, which is fair to both the 
>> register path
>> and unregister path. (The patch I've linked is a version of this 
>> adapted to an
>> older 4.14 kernel FYI, but it can be reworked for the current kernel.)
>>
>> What do you think about the fine-grained locking approach?
>>
>> Thanks,
>> Sultan
>>
>> [1] 
>> https://github.com/kerneltoast/android_kernel_google_floral/commit/012378f3173a82d2333d3ae7326691544301e76a
>>
>
Sultan Alsawaf (unemployed) Feb. 24, 2023, 8:20 a.m. UTC | #9
On Fri, Feb 24, 2023 at 12:00:21PM +0800, Qi Zheng wrote:
> 
> 
> On 2023/2/24 02:24, Sultan Alsawaf wrote:
> > On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
> > > The shrinker_rwsem is a global lock in shrinkers subsystem,
> > > it is easy to cause blocking in the following cases:
> > > 
> > > a. the write lock of shrinker_rwsem was held for too long.
> > >     For example, there are many memcgs in the system, which
> > >     causes some paths to hold locks and traverse it for too
> > >     long. (e.g. expand_shrinker_info())
> > > b. the read lock of shrinker_rwsem was held for too long,
> > >     and a writer came at this time. Then this writer will be
> > >     forced to wait and block all subsequent readers.
> > >     For example:
> > >     - be scheduled when the read lock of shrinker_rwsem is
> > >       held in do_shrink_slab()
> > >     - some shrinker are blocked for too long. Like the case
> > >       mentioned in the patchset[1].
> > > 
> > > Therefore, many times in history ([2],[3],[4],[5]), some
> > > people wanted to replace shrinker_rwsem reader with SRCU,
> > > but they all gave up because SRCU was not unconditionally
> > > enabled.
> > > 
> > > But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
> > > the SRCU is unconditionally enabled. So it's time to use
> > > SRCU to protect readers who previously held shrinker_rwsem.
> > > 
> > > [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
> > > [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
> > > [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
> > > [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
> > > [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
> > > 
> > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > ---
> > >   mm/vmscan.c | 27 +++++++++++----------------
> > >   1 file changed, 11 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 9f895ca6216c..02987a6f95d1 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
> > >   LIST_HEAD(shrinker_list);
> > >   DECLARE_RWSEM(shrinker_rwsem);
> > > +DEFINE_SRCU(shrinker_srcu);
> > >   #ifdef CONFIG_MEMCG
> > >   static int shrinker_nr_max;
> > > @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
> > >   void register_shrinker_prepared(struct shrinker *shrinker)
> > >   {
> > >   	down_write(&shrinker_rwsem);
> > > -	list_add_tail(&shrinker->list, &shrinker_list);
> > > +	list_add_tail_rcu(&shrinker->list, &shrinker_list);
> > >   	shrinker->flags |= SHRINKER_REGISTERED;
> > >   	shrinker_debugfs_add(shrinker);
> > >   	up_write(&shrinker_rwsem);
> > > @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
> > >   		return;
> > >   	down_write(&shrinker_rwsem);
> > > -	list_del(&shrinker->list);
> > > +	list_del_rcu(&shrinker->list);
> > >   	shrinker->flags &= ~SHRINKER_REGISTERED;
> > >   	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
> > >   		unregister_memcg_shrinker(shrinker);
> > >   	debugfs_entry = shrinker_debugfs_remove(shrinker);
> > >   	up_write(&shrinker_rwsem);
> > > +	synchronize_srcu(&shrinker_srcu);
> > > +
> > >   	debugfs_remove_recursive(debugfs_entry);
> > >   	kfree(shrinker->nr_deferred);
> > > @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
> > >   {
> > >   	down_write(&shrinker_rwsem);
> > >   	up_write(&shrinker_rwsem);
> > > +	synchronize_srcu(&shrinker_srcu);
> > >   }
> > >   EXPORT_SYMBOL(synchronize_shrinkers);
> > > @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> > >   {
> > >   	unsigned long ret, freed = 0;
> > >   	struct shrinker *shrinker;
> > > +	int srcu_idx;
> > >   	/*
> > >   	 * The root memcg might be allocated even though memcg is disabled
> > > @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> > >   	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
> > >   		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
> > > -	if (!down_read_trylock(&shrinker_rwsem))
> > > -		goto out;
> > > +	srcu_idx = srcu_read_lock(&shrinker_srcu);
> > > -	list_for_each_entry(shrinker, &shrinker_list, list) {
> > > +	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
> > > +				 srcu_read_lock_held(&shrinker_srcu)) {
> > >   		struct shrink_control sc = {
> > >   			.gfp_mask = gfp_mask,
> > >   			.nid = nid,
> > > @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
> > >   		if (ret == SHRINK_EMPTY)
> > >   			ret = 0;
> > >   		freed += ret;
> > > -		/*
> > > -		 * Bail out if someone want to register a new shrinker to
> > > -		 * prevent the registration from being stalled for long periods
> > > -		 * by parallel ongoing shrinking.
> > > -		 */
> > > -		if (rwsem_is_contended(&shrinker_rwsem)) {
> > > -			freed = freed ? : 1;
> > > -			break;
> > > -		}
> > >   	}
> > > -	up_read(&shrinker_rwsem);
> > > -out:
> > > +	srcu_read_unlock(&shrinker_srcu, srcu_idx);
> > >   	cond_resched();
> > >   	return freed;
> > >   }
> > > -- 
> > > 2.20.1
> > > 
> > > 
> > 
> > Hi Qi,
> > 
> > A different problem I realized after my old attempt to use SRCU was that the
> > unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
> > call. Both register_shrinker() *and* unregister_shrinker() are called frequently
> > these days, and SRCU is too unfair to the unregister path IMO.
> 
> Hi Sultan,
> 
> IIUC, for unregister_shrinker(), the wait time is hardly longer with
> SRCU than with shrinker_rwsem before.

The wait time can be quite different because with shrinker_rwsem, the
rwsem_is_contended() bailout would cause unregister_shrinker() to wait for only
one random shrinker to finish at worst rather than waiting for *all* shrinkers
to finish.

> And I just did a simple test. After using the script in cover letter to
> increase the shrink_slab hotspot, I did umount 1k times at the same
> time, and then I used bpftrace to measure the time consumption of
> unregister_shrinker() as follows:
> 
> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; }
> kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs -
> @start[tid]); delete(@start[tid]); }'
> 
> @ns[umount]:
> [16K, 32K)             3 |                                                    |
> [32K, 64K)            66 |@@@@@@@@@@                                          |
> [64K, 128K)           32 |@@@@@                                               |
> [128K, 256K)          22 |@@@                                                 |
> [256K, 512K)          48 |@@@@@@@                                             |
> [512K, 1M)            19 |@@@                                                 |
> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@                               |
> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
> [8M, 16M)             55 |@@@@@@@@@                                           |
> 
> I see that the highest time-consuming of unregister_shrinker() is between
> 8ms and 16ms, which feels tolerable?

If you've got a fast x86 machine then I'd say that's a bit slow. :)

This depends a lot on which shrinkers are active on your system and how much
work each one does upon running. If a driver's shrinker doesn't have much to do
because there's nothing it can shrink further, then it'll run fast. Conversely,
if a driver is stressed in a way that constantly creates a lot of potential work
for its shrinker, then the shrinker will run longer.

Since shrinkers are allowed to sleep, the delays can really add up when waiting
for all of them to finish running. In the past, I recall observing delays of
100ms+ in unregister_shrinker() on slower arm64 hardware when I stress tested
the SRCU approach.

If your GPU driver has a shrinker (such as i915), I suggest testing again under
heavy GPU load. The GPU shrinkers can be pretty heavy IIRC.

Thanks,
Sultan

> Thanks,
> Qi
> 
> > 
> > Although I never got around to submitting it, I made a non-SRCU solution [1]
> > that uses fine-grained locking instead, which is fair to both the register path
> > and unregister path. (The patch I've linked is a version of this adapted to an
> > older 4.14 kernel FYI, but it can be reworked for the current kernel.)
> > 
> > What do you think about the fine-grained locking approach?
> > 
> > Thanks,
> > Sultan
> > 
> > [1] https://github.com/kerneltoast/android_kernel_google_floral/commit/012378f3173a82d2333d3ae7326691544301e76a
> > 
> 
> -- 
> Thanks,
> Qi
Qi Zheng Feb. 24, 2023, 10:12 a.m. UTC | #10
On 2023/2/24 16:20, Sultan Alsawaf wrote:
> On Fri, Feb 24, 2023 at 12:00:21PM +0800, Qi Zheng wrote:
>>
>>
>> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>>> it is easy to cause blocking in the following cases:
>>>>
>>>> a. the write lock of shrinker_rwsem was held for too long.
>>>>      For example, there are many memcgs in the system, which
>>>>      causes some paths to hold locks and traverse it for too
>>>>      long. (e.g. expand_shrinker_info())
>>>> b. the read lock of shrinker_rwsem was held for too long,
>>>>      and a writer came at this time. Then this writer will be
>>>>      forced to wait and block all subsequent readers.
>>>>      For example:
>>>>      - be scheduled when the read lock of shrinker_rwsem is
>>>>        held in do_shrink_slab()
>>>>      - some shrinker are blocked for too long. Like the case
>>>>        mentioned in the patchset[1].
>>>>
>>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>>> but they all gave up because SRCU was not unconditionally
>>>> enabled.
>>>>
>>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>>> the SRCU is unconditionally enabled. So it's time to use
>>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>>
>>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> ---
>>>>    mm/vmscan.c | 27 +++++++++++----------------
>>>>    1 file changed, 11 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>> index 9f895ca6216c..02987a6f95d1 100644
>>>> --- a/mm/vmscan.c
>>>> +++ b/mm/vmscan.c
>>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>    LIST_HEAD(shrinker_list);
>>>>    DECLARE_RWSEM(shrinker_rwsem);
>>>> +DEFINE_SRCU(shrinker_srcu);
>>>>    #ifdef CONFIG_MEMCG
>>>>    static int shrinker_nr_max;
>>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>>    void register_shrinker_prepared(struct shrinker *shrinker)
>>>>    {
>>>>    	down_write(&shrinker_rwsem);
>>>> -	list_add_tail(&shrinker->list, &shrinker_list);
>>>> +	list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>>    	shrinker->flags |= SHRINKER_REGISTERED;
>>>>    	shrinker_debugfs_add(shrinker);
>>>>    	up_write(&shrinker_rwsem);
>>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>    		return;
>>>>    	down_write(&shrinker_rwsem);
>>>> -	list_del(&shrinker->list);
>>>> +	list_del_rcu(&shrinker->list);
>>>>    	shrinker->flags &= ~SHRINKER_REGISTERED;
>>>>    	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>>    		unregister_memcg_shrinker(shrinker);
>>>>    	debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>    	up_write(&shrinker_rwsem);
>>>> +	synchronize_srcu(&shrinker_srcu);
>>>> +
>>>>    	debugfs_remove_recursive(debugfs_entry);
>>>>    	kfree(shrinker->nr_deferred);
>>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>>    {
>>>>    	down_write(&shrinker_rwsem);
>>>>    	up_write(&shrinker_rwsem);
>>>> +	synchronize_srcu(&shrinker_srcu);
>>>>    }
>>>>    EXPORT_SYMBOL(synchronize_shrinkers);
>>>> @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>    {
>>>>    	unsigned long ret, freed = 0;
>>>>    	struct shrinker *shrinker;
>>>> +	int srcu_idx;
>>>>    	/*
>>>>    	 * The root memcg might be allocated even though memcg is disabled
>>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>    	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>    		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>> -	if (!down_read_trylock(&shrinker_rwsem))
>>>> -		goto out;
>>>> +	srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>> -	list_for_each_entry(shrinker, &shrinker_list, list) {
>>>> +	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>> +				 srcu_read_lock_held(&shrinker_srcu)) {
>>>>    		struct shrink_control sc = {
>>>>    			.gfp_mask = gfp_mask,
>>>>    			.nid = nid,
>>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>    		if (ret == SHRINK_EMPTY)
>>>>    			ret = 0;
>>>>    		freed += ret;
>>>> -		/*
>>>> -		 * Bail out if someone want to register a new shrinker to
>>>> -		 * prevent the registration from being stalled for long periods
>>>> -		 * by parallel ongoing shrinking.
>>>> -		 */
>>>> -		if (rwsem_is_contended(&shrinker_rwsem)) {
>>>> -			freed = freed ? : 1;
>>>> -			break;
>>>> -		}
>>>>    	}
>>>> -	up_read(&shrinker_rwsem);
>>>> -out:
>>>> +	srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>    	cond_resched();
>>>>    	return freed;
>>>>    }
>>>> -- 
>>>> 2.20.1
>>>>
>>>>
>>>
>>> Hi Qi,
>>>
>>> A different problem I realized after my old attempt to use SRCU was that the
>>> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
>>> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
>>> these days, and SRCU is too unfair to the unregister path IMO.
>>
>> Hi Sultan,
>>
>> IIUC, for unregister_shrinker(), the wait time is hardly longer with
>> SRCU than with shrinker_rwsem before.
> 
> The wait time can be quite different because with shrinker_rwsem, the
> rwsem_is_contended() bailout would cause unregister_shrinker() to wait for only
> one random shrinker to finish at worst rather than waiting for *all* shrinkers
> to finish.

Yes, to be exact, unregister_shrinker() needs to wait for all the
shrinkers that entered the grace period before it. But the benefit in
exchange is that slab shrinking becomes completely lock-free; I think
that is worth letting unregister_shrinker() wait a little longer.

> 
>> And I just did a simple test. After using the script in cover letter to
>> increase the shrink_slab hotspot, I did umount 1k times at the same
>> time, and then I used bpftrace to measure the time consumption of
>> unregister_shrinker() as follows:
>>
>> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
>>
>> @ns[umount]:
>> [16K, 32K)             3 |      |
>> [32K, 64K)            66 |@@@@@@@@@@      |
>> [64K, 128K)           32 |@@@@@      |
>> [128K, 256K)          22 |@@@      |
>> [256K, 512K)          48 |@@@@@@@      |
>> [512K, 1M)            19 |@@@      |
>> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
>> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
>> [8M, 16M)             55 |@@@@@@@@@
>>
>> I see that the highest time-consuming of unregister_shrinker() is between
>> 8ms and 16ms, which feels tolerable?
> 
> If you've got a fast x86 machine then I'd say that's a bit slow. :)

Nope, I tested it on a qemu virtual machine.

And I just tested it on a physical machine (Intel(R) Xeon(R) Platinum
8260 CPU @ 2.40GHz) and the results are as follows:

1) use synchronize_srcu():

@ns[umount]:
[8K, 16K)             83 |@@@@@@@                                             |
[16K, 32K)           578 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[32K, 64K)            78 |@@@@@@@                                             |
[64K, 128K)            6 |                                                    |
[128K, 256K)           7 |                                                    |
[256K, 512K)          29 |@@                                                  |
[512K, 1M)            51 |@@@@                                                |
[1M, 2M)              90 |@@@@@@@@                                            |
[2M, 4M)              70 |@@@@@@                                              |
[4M, 8M)               8 |                                                    |

2) use synchronize_srcu_expedited():

@ns[umount]:
[8K, 16K)             31 |@@                                                  |
[16K, 32K)           803 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[32K, 64K)           158 |@@@@@@@@@@                                          |
[64K, 128K)            4 |                                                    |
[128K, 256K)           2 |                                                    |
[256K, 512K)           2 |                                                    |
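
For completeness, the synchronize_srcu_expedited() numbers above were
presumably collected with a one-line change along the following lines in
unregister_shrinker() (only a sketch of the tested variant, not something
proposed by this series; the context lines follow the v2 patch quoted above):

	debugfs_entry = shrinker_debugfs_remove(shrinker);
	up_write(&shrinker_rwsem);

	/* expedited SRCU grace period: lower latency at the cost of extra CPU work */
	synchronize_srcu_expedited(&shrinker_srcu);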

Thanks,
Qi

> 
> This depends a lot on which shrinkers are active on your system and how much
> work each one does upon running. If a driver's shrinker doesn't have much to do
> because there's nothing it can shrink further, then it'll run fast. Conversely,
> if a driver is stressed in a way that constantly creates a lot of potential work
> for its shrinker, then the shrinker will run longer.
> 
> Since shrinkers are allowed to sleep, the delays can really add up when waiting
> for all of them to finish running. In the past, I recall observing delays of
> 100ms+ in unregister_shrinker() on slower arm64 hardware when I stress tested
> the SRCU approach.
> 
> If your GPU driver has a shrinker (such as i915), I suggest testing again under
> heavy GPU load. The GPU shrinkers can be pretty heavy IIRC.
> 
> Thanks,
> Sultan
> 
>> Thanks,
>> Qi
>>
>>>
>>> Although I never got around to submitting it, I made a non-SRCU solution [1]
>>> that uses fine-grained locking instead, which is fair to both the register path
>>> and unregister path. (The patch I've linked is a version of this adapted to an
>>> older 4.14 kernel FYI, but it can be reworked for the current kernel.)
>>>
>>> What do you think about the fine-grained locking approach?
>>>
>>> Thanks,
>>> Sultan
>>>
>>> [1] https://github.com/kerneltoast/android_kernel_google_floral/commit/012378f3173a82d2333d3ae7326691544301e76a
>>>
>>
>> -- 
>> Thanks,
>> Qi
Kirill Tkhai Feb. 24, 2023, 9:02 p.m. UTC | #11
On 24.02.2023 07:00, Qi Zheng wrote:
> 
> 
> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>> it is easy to cause blocking in the following cases:
>>>
>>> a. the write lock of shrinker_rwsem was held for too long.
>>>     For example, there are many memcgs in the system, which
>>>     causes some paths to hold locks and traverse it for too
>>>     long. (e.g. expand_shrinker_info())
>>> b. the read lock of shrinker_rwsem was held for too long,
>>>     and a writer came at this time. Then this writer will be
>>>     forced to wait and block all subsequent readers.
>>>     For example:
>>>     - be scheduled when the read lock of shrinker_rwsem is
>>>       held in do_shrink_slab()
>>>     - some shrinker are blocked for too long. Like the case
>>>       mentioned in the patchset[1].
>>>
>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>> but they all gave up because SRCU was not unconditionally
>>> enabled.
>>>
>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>> the SRCU is unconditionally enabled. So it's time to use
>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>
>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>>   mm/vmscan.c | 27 +++++++++++----------------
>>>   1 file changed, 11 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 9f895ca6216c..02987a6f95d1 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>     LIST_HEAD(shrinker_list);
>>>   DECLARE_RWSEM(shrinker_rwsem);
>>> +DEFINE_SRCU(shrinker_srcu);
>>>     #ifdef CONFIG_MEMCG
>>>   static int shrinker_nr_max;
>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>   void register_shrinker_prepared(struct shrinker *shrinker)
>>>   {
>>>       down_write(&shrinker_rwsem);
>>> -    list_add_tail(&shrinker->list, &shrinker_list);
>>> +    list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>       shrinker->flags |= SHRINKER_REGISTERED;
>>>       shrinker_debugfs_add(shrinker);
>>>       up_write(&shrinker_rwsem);
>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>           return;
>>>         down_write(&shrinker_rwsem);
>>> -    list_del(&shrinker->list);
>>> +    list_del_rcu(&shrinker->list);
>>>       shrinker->flags &= ~SHRINKER_REGISTERED;
>>>       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>           unregister_memcg_shrinker(shrinker);
>>>       debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>       up_write(&shrinker_rwsem);
>>>   +    synchronize_srcu(&shrinker_srcu);
>>> +
>>>       debugfs_remove_recursive(debugfs_entry);
>>>         kfree(shrinker->nr_deferred);
>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>   {
>>>       down_write(&shrinker_rwsem);
>>>       up_write(&shrinker_rwsem);
>>> +    synchronize_srcu(&shrinker_srcu);
>>>   }
>>>   EXPORT_SYMBOL(synchronize_shrinkers);
>>>   @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>   {
>>>       unsigned long ret, freed = 0;
>>>       struct shrinker *shrinker;
>>> +    int srcu_idx;
>>>         /*
>>>        * The root memcg might be allocated even though memcg is disabled
>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>       if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>           return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>   -    if (!down_read_trylock(&shrinker_rwsem))
>>> -        goto out;
>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>   -    list_for_each_entry(shrinker, &shrinker_list, list) {
>>> +    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>> +                 srcu_read_lock_held(&shrinker_srcu)) {
>>>           struct shrink_control sc = {
>>>               .gfp_mask = gfp_mask,
>>>               .nid = nid,
>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>           if (ret == SHRINK_EMPTY)
>>>               ret = 0;
>>>           freed += ret;
>>> -        /*
>>> -         * Bail out if someone want to register a new shrinker to
>>> -         * prevent the registration from being stalled for long periods
>>> -         * by parallel ongoing shrinking.
>>> -         */
>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>> -            freed = freed ? : 1;
>>> -            break;
>>> -        }
>>>       }
>>>   -    up_read(&shrinker_rwsem);
>>> -out:
>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>       cond_resched();
>>>       return freed;
>>>   }
>>> -- 
>>> 2.20.1
>>>
>>>
>>
>> Hi Qi,
>>
>> A different problem I realized after my old attempt to use SRCU was that the
>> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
>> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
>> these days, and SRCU is too unfair to the unregister path IMO.
> 
> Hi Sultan,
> 
> IIUC, for unregister_shrinker(), the wait time is hardly longer with
> SRCU than with shrinker_rwsem before.
> 
> And I just did a simple test. After using the script in cover letter to
> increase the shrink_slab hotspot, I did umount 1k times at the same
> time, and then I used bpftrace to measure the time consumption of
> unregister_shrinker() as follows:
> 
> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
> 
> @ns[umount]:
> [16K, 32K)             3 |      |
> [32K, 64K)            66 |@@@@@@@@@@      |
> [64K, 128K)           32 |@@@@@      |
> [128K, 256K)          22 |@@@      |
> [256K, 512K)          48 |@@@@@@@      |
> [512K, 1M)            19 |@@@      |
> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
> [8M, 16M)             55 |@@@@@@@@@
> 
> I see that the highest time-consuming of unregister_shrinker() is between 8ms and 16ms, which feels tolerable?

The fundamental difference is that before the patchset this for_each_set_bit() iteration could be broken in the middle
of two do_shrink_slab() calls, while after the patchset we can leave for_each_set_bit() only after visiting all set bits.

Using only synchronize_srcu_expedited() won't help here.

My opinion is we should restore a check similar to the rwsem_is_contended() check that we had before. Something like
the below, on top of your patchset and merged into the appropriate patch:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 27ef9946ae8a..50e7812468ec 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
 LIST_HEAD(shrinker_list);
 DEFINE_MUTEX(shrinker_mutex);
 DEFINE_SRCU(shrinker_srcu);
+static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
 
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
@@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
 	debugfs_entry = shrinker_debugfs_remove(shrinker);
 	mutex_unlock(&shrinker_mutex);
 
+	atomic_inc(&shrinker_srcu_generation);
 	synchronize_srcu(&shrinker_srcu);
 
 	debugfs_remove_recursive(debugfs_entry);
@@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
  */
 void synchronize_shrinkers(void)
 {
+	atomic_inc(&shrinker_srcu_generation);
 	synchronize_srcu(&shrinker_srcu);
 }
 EXPORT_SYMBOL(synchronize_shrinkers);
@@ -908,7 +911,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
 	struct shrinker_info *info;
 	unsigned long ret, freed = 0;
-	int srcu_idx;
+	int srcu_idx, generation;
 	int i;
 
 	if (!mem_cgroup_online(memcg))
@@ -919,6 +922,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 	if (unlikely(!info))
 		goto unlock;
 
+	generation = atomic_read(&shrinker_srcu_generation);
 	for_each_set_bit(i, info->map, info->map_nr_max) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
@@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 				set_shrinker_bit(memcg, nid, i);
 		}
 		freed += ret;
+
+		if (atomic_read(&shrinker_srcu_generation) != generation) {
+			freed = freed ? : 1;
+			break;
+		}
 	}
 unlock:
 	srcu_read_unlock(&shrinker_srcu, srcu_idx);
@@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 {
 	unsigned long ret, freed = 0;
 	struct shrinker *shrinker;
-	int srcu_idx;
+	int srcu_idx, generation;
 
 	/*
 	 * The root memcg might be allocated even though memcg is disabled
@@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
 
 	srcu_idx = srcu_read_lock(&shrinker_srcu);
+	generation = atomic_read(&shrinker_srcu_generation);
 
 	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
 				 srcu_read_lock_held(&shrinker_srcu)) {
@@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		if (ret == SHRINK_EMPTY)
 			ret = 0;
 		freed += ret;
+
+		if (atomic_read(&shrinker_srcu_generation) != generation) {
+			freed = freed ? : 1;
+			break;
+		}
 	}
 
 	srcu_read_unlock(&shrinker_srcu, srcu_idx);

Kirill
Kirill Tkhai Feb. 24, 2023, 9:14 p.m. UTC | #12
On 25.02.2023 00:02, Kirill Tkhai wrote:
> On 24.02.2023 07:00, Qi Zheng wrote:
>>
>>
>> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>>> it is easy to cause blocking in the following cases:
>>>>
>>>> a. the write lock of shrinker_rwsem was held for too long.
>>>>     For example, there are many memcgs in the system, which
>>>>     causes some paths to hold locks and traverse it for too
>>>>     long. (e.g. expand_shrinker_info())
>>>> b. the read lock of shrinker_rwsem was held for too long,
>>>>     and a writer came at this time. Then this writer will be
>>>>     forced to wait and block all subsequent readers.
>>>>     For example:
>>>>     - be scheduled when the read lock of shrinker_rwsem is
>>>>       held in do_shrink_slab()
>>>>     - some shrinker are blocked for too long. Like the case
>>>>       mentioned in the patchset[1].
>>>>
>>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>>> but they all gave up because SRCU was not unconditionally
>>>> enabled.
>>>>
>>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>>> the SRCU is unconditionally enabled. So it's time to use
>>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>>
>>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> ---
>>>>   mm/vmscan.c | 27 +++++++++++----------------
>>>>   1 file changed, 11 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>> index 9f895ca6216c..02987a6f95d1 100644
>>>> --- a/mm/vmscan.c
>>>> +++ b/mm/vmscan.c
>>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>     LIST_HEAD(shrinker_list);
>>>>   DECLARE_RWSEM(shrinker_rwsem);
>>>> +DEFINE_SRCU(shrinker_srcu);
>>>>     #ifdef CONFIG_MEMCG
>>>>   static int shrinker_nr_max;
>>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>>   void register_shrinker_prepared(struct shrinker *shrinker)
>>>>   {
>>>>       down_write(&shrinker_rwsem);
>>>> -    list_add_tail(&shrinker->list, &shrinker_list);
>>>> +    list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>>       shrinker->flags |= SHRINKER_REGISTERED;
>>>>       shrinker_debugfs_add(shrinker);
>>>>       up_write(&shrinker_rwsem);
>>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>           return;
>>>>         down_write(&shrinker_rwsem);
>>>> -    list_del(&shrinker->list);
>>>> +    list_del_rcu(&shrinker->list);
>>>>       shrinker->flags &= ~SHRINKER_REGISTERED;
>>>>       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>>           unregister_memcg_shrinker(shrinker);
>>>>       debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>       up_write(&shrinker_rwsem);
>>>>   +    synchronize_srcu(&shrinker_srcu);
>>>> +
>>>>       debugfs_remove_recursive(debugfs_entry);
>>>>         kfree(shrinker->nr_deferred);
>>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>>   {
>>>>       down_write(&shrinker_rwsem);
>>>>       up_write(&shrinker_rwsem);
>>>> +    synchronize_srcu(&shrinker_srcu);
>>>>   }
>>>>   EXPORT_SYMBOL(synchronize_shrinkers);
>>>>   @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>   {
>>>>       unsigned long ret, freed = 0;
>>>>       struct shrinker *shrinker;
>>>> +    int srcu_idx;
>>>>         /*
>>>>        * The root memcg might be allocated even though memcg is disabled
>>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>       if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>           return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>   -    if (!down_read_trylock(&shrinker_rwsem))
>>>> -        goto out;
>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>   -    list_for_each_entry(shrinker, &shrinker_list, list) {
>>>> +    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>> +                 srcu_read_lock_held(&shrinker_srcu)) {
>>>>           struct shrink_control sc = {
>>>>               .gfp_mask = gfp_mask,
>>>>               .nid = nid,
>>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>           if (ret == SHRINK_EMPTY)
>>>>               ret = 0;
>>>>           freed += ret;
>>>> -        /*
>>>> -         * Bail out if someone want to register a new shrinker to
>>>> -         * prevent the registration from being stalled for long periods
>>>> -         * by parallel ongoing shrinking.
>>>> -         */
>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>> -            freed = freed ? : 1;
>>>> -            break;
>>>> -        }
>>>>       }
>>>>   -    up_read(&shrinker_rwsem);
>>>> -out:
>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>       cond_resched();
>>>>       return freed;
>>>>   }
>>>> -- 
>>>> 2.20.1
>>>>
>>>>
>>>
>>> Hi Qi,
>>>
>>> A different problem I realized after my old attempt to use SRCU was that the
>>> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
>>> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
>>> these days, and SRCU is too unfair to the unregister path IMO.
>>
>> Hi Sultan,
>>
>> IIUC, for unregister_shrinker(), the wait time is hardly longer with
>> SRCU than with shrinker_rwsem before.
>>
>> And I just did a simple test. After using the script in cover letter to
>> increase the shrink_slab hotspot, I did umount 1k times at the same
>> time, and then I used bpftrace to measure the time consumption of
>> unregister_shrinker() as follows:
>>
>> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
>>
>> @ns[umount]:
>> [16K, 32K)             3 |      |
>> [32K, 64K)            66 |@@@@@@@@@@      |
>> [64K, 128K)           32 |@@@@@      |
>> [128K, 256K)          22 |@@@      |
>> [256K, 512K)          48 |@@@@@@@      |
>> [512K, 1M)            19 |@@@      |
>> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
>> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
>> [8M, 16M)             55 |@@@@@@@@@
>>
>> I see that the highest time-consuming of unregister_shrinker() is between 8ms and 16ms, which feels tolerable?
> 
> The fundamental difference is that before the patchset this for_each_set_bit() iteration could be broken in the middle
> of two do_shrink_slab() calls, while after the patchset we can leave for_each_set_bit() only after visiting all set bits.
> 
> Using only synchronize_srcu_expedited() won't help here.
> 
> My opinion is we should restore a check similar to the rwsem_is_contendent() check that we had before. Something like
> the below on top of your patchset merged into appropriate patch:
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 27ef9946ae8a..50e7812468ec 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>  LIST_HEAD(shrinker_list);
>  DEFINE_MUTEX(shrinker_mutex);
>  DEFINE_SRCU(shrinker_srcu);
> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>  
>  #ifdef CONFIG_MEMCG
>  static int shrinker_nr_max;
> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>  	debugfs_entry = shrinker_debugfs_remove(shrinker);
>  	mutex_unlock(&shrinker_mutex);
>  
> +	atomic_inc(&shrinker_srcu_generation);
>  	synchronize_srcu(&shrinker_srcu);
>  
>  	debugfs_remove_recursive(debugfs_entry);
> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>   */
>  void synchronize_shrinkers(void)
>  {
> +	atomic_inc(&shrinker_srcu_generation);
>  	synchronize_srcu(&shrinker_srcu);
>  }
>  EXPORT_SYMBOL(synchronize_shrinkers);
> @@ -908,7 +911,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>  {
>  	struct shrinker_info *info;
>  	unsigned long ret, freed = 0;
> -	int srcu_idx;
> +	int srcu_idx, generation;
>  	int i;
>  
>  	if (!mem_cgroup_online(memcg))
> @@ -919,6 +922,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>  	if (unlikely(!info))
>  		goto unlock;
>  
> +	generation = atomic_read(&shrinker_srcu_generation);
>  	for_each_set_bit(i, info->map, info->map_nr_max) {
>  		struct shrink_control sc = {
>  			.gfp_mask = gfp_mask,
> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>  				set_shrinker_bit(memcg, nid, i);
>  		}
>  		freed += ret;
> +
> +		if (atomic_read(&shrinker_srcu_generation) != generation) {
> +			freed = freed ? : 1;
> +			break;
> +		}
>  	}
>  unlock:
>  	srcu_read_unlock(&shrinker_srcu, srcu_idx);
> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  {
>  	unsigned long ret, freed = 0;
>  	struct shrinker *shrinker;
> -	int srcu_idx;
> +	int srcu_idx, generation;
>  
>  	/*
>  	 * The root memcg might be allocated even though memcg is disabled
> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>  
>  	srcu_idx = srcu_read_lock(&shrinker_srcu);
> +	generation = atomic_read(&shrinker_srcu_generation);
>  
>  	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>  				 srcu_read_lock_held(&shrinker_srcu)) {
> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>  		if (ret == SHRINK_EMPTY)
>  			ret = 0;
>  		freed += ret;
> +
> +		if (atomic_read(&shrinker_srcu_generation) != generation) {
> +			freed = freed ? : 1;
> +			break;
> +		}
>  	}
>  
>  	srcu_read_unlock(&shrinker_srcu, srcu_idx);

Even more, for memcg shrinkers we may unlock SRCU and continue iterations from the same shrinker id:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 27ef9946ae8a..0b197bba1257 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
 LIST_HEAD(shrinker_list);
 DEFINE_MUTEX(shrinker_mutex);
 DEFINE_SRCU(shrinker_srcu);
+static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
 
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
@@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
 	debugfs_entry = shrinker_debugfs_remove(shrinker);
 	mutex_unlock(&shrinker_mutex);
 
+	atomic_inc(&shrinker_srcu_generation);
 	synchronize_srcu(&shrinker_srcu);
 
 	debugfs_remove_recursive(debugfs_entry);
@@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
  */
 void synchronize_shrinkers(void)
 {
+	atomic_inc(&shrinker_srcu_generation);
 	synchronize_srcu(&shrinker_srcu);
 }
 EXPORT_SYMBOL(synchronize_shrinkers);
@@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
 	struct shrinker_info *info;
 	unsigned long ret, freed = 0;
-	int srcu_idx;
-	int i;
+	int srcu_idx, generation;
+	int i = 0;
 
 	if (!mem_cgroup_online(memcg))
 		return 0;
-
+again:
 	srcu_idx = srcu_read_lock(&shrinker_srcu);
 	info = shrinker_info_srcu(memcg, nid);
 	if (unlikely(!info))
 		goto unlock;
 
-	for_each_set_bit(i, info->map, info->map_nr_max) {
+	generation = atomic_read(&shrinker_srcu_generation);
+	for_each_set_bit_from(i, info->map, info->map_nr_max) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
@@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 				set_shrinker_bit(memcg, nid, i);
 		}
 		freed += ret;
+
+		if (atomic_read(&shrinker_srcu_generation) != generation) {
+			srcu_read_unlock(&shrinker_srcu, srcu_idx);
+			goto again;
+		}
 	}
 unlock:
 	srcu_read_unlock(&shrinker_srcu, srcu_idx);
@@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 {
 	unsigned long ret, freed = 0;
 	struct shrinker *shrinker;
-	int srcu_idx;
+	int srcu_idx, generation;
 
 	/*
 	 * The root memcg might be allocated even though memcg is disabled
@@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
 
 	srcu_idx = srcu_read_lock(&shrinker_srcu);
+	generation = atomic_read(&shrinker_srcu_generation);
 
 	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
 				 srcu_read_lock_held(&shrinker_srcu)) {
@@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		if (ret == SHRINK_EMPTY)
 			ret = 0;
 		freed += ret;
+
+		if (atomic_read(&shrinker_srcu_generation) != generation) {
+			freed = freed ? : 1;
+			break;
+		}
 	}
 
 	srcu_read_unlock(&shrinker_srcu, srcu_idx);
Qi Zheng Feb. 25, 2023, 8:08 a.m. UTC | #13
On 2023/2/25 05:14, Kirill Tkhai wrote:
> On 25.02.2023 00:02, Kirill Tkhai wrote:
>> On 24.02.2023 07:00, Qi Zheng wrote:
>>>
>>>
>>> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>>>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>>>> it is easy to cause blocking in the following cases:
>>>>>
>>>>> a. the write lock of shrinker_rwsem was held for too long.
>>>>>      For example, there are many memcgs in the system, which
>>>>>      causes some paths to hold locks and traverse it for too
>>>>>      long. (e.g. expand_shrinker_info())
>>>>> b. the read lock of shrinker_rwsem was held for too long,
>>>>>      and a writer came at this time. Then this writer will be
>>>>>      forced to wait and block all subsequent readers.
>>>>>      For example:
>>>>>      - be scheduled when the read lock of shrinker_rwsem is
>>>>>        held in do_shrink_slab()
>>>>>      - some shrinker are blocked for too long. Like the case
>>>>>        mentioned in the patchset[1].
>>>>>
>>>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>>>> but they all gave up because SRCU was not unconditionally
>>>>> enabled.
>>>>>
>>>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>>>> the SRCU is unconditionally enabled. So it's time to use
>>>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>>>
>>>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>>>
>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>> ---
>>>>>    mm/vmscan.c | 27 +++++++++++----------------
>>>>>    1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>
>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>> index 9f895ca6216c..02987a6f95d1 100644
>>>>> --- a/mm/vmscan.c
>>>>> +++ b/mm/vmscan.c
>>>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>      LIST_HEAD(shrinker_list);
>>>>>    DECLARE_RWSEM(shrinker_rwsem);
>>>>> +DEFINE_SRCU(shrinker_srcu);
>>>>>      #ifdef CONFIG_MEMCG
>>>>>    static int shrinker_nr_max;
>>>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>>>    void register_shrinker_prepared(struct shrinker *shrinker)
>>>>>    {
>>>>>        down_write(&shrinker_rwsem);
>>>>> -    list_add_tail(&shrinker->list, &shrinker_list);
>>>>> +    list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>>>        shrinker->flags |= SHRINKER_REGISTERED;
>>>>>        shrinker_debugfs_add(shrinker);
>>>>>        up_write(&shrinker_rwsem);
>>>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>>            return;
>>>>>          down_write(&shrinker_rwsem);
>>>>> -    list_del(&shrinker->list);
>>>>> +    list_del_rcu(&shrinker->list);
>>>>>        shrinker->flags &= ~SHRINKER_REGISTERED;
>>>>>        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>>>            unregister_memcg_shrinker(shrinker);
>>>>>        debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>>        up_write(&shrinker_rwsem);
>>>>>    +    synchronize_srcu(&shrinker_srcu);
>>>>> +
>>>>>        debugfs_remove_recursive(debugfs_entry);
>>>>>          kfree(shrinker->nr_deferred);
>>>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>>>    {
>>>>>        down_write(&shrinker_rwsem);
>>>>>        up_write(&shrinker_rwsem);
>>>>> +    synchronize_srcu(&shrinker_srcu);
>>>>>    }
>>>>>    EXPORT_SYMBOL(synchronize_shrinkers);
>>>>>    @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>    {
>>>>>        unsigned long ret, freed = 0;
>>>>>        struct shrinker *shrinker;
>>>>> +    int srcu_idx;
>>>>>          /*
>>>>>         * The root memcg might be allocated even though memcg is disabled
>>>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>>            return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>    -    if (!down_read_trylock(&shrinker_rwsem))
>>>>> -        goto out;
>>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>    -    list_for_each_entry(shrinker, &shrinker_list, list) {
>>>>> +    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>> +                 srcu_read_lock_held(&shrinker_srcu)) {
>>>>>            struct shrink_control sc = {
>>>>>                .gfp_mask = gfp_mask,
>>>>>                .nid = nid,
>>>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>            if (ret == SHRINK_EMPTY)
>>>>>                ret = 0;
>>>>>            freed += ret;
>>>>> -        /*
>>>>> -         * Bail out if someone want to register a new shrinker to
>>>>> -         * prevent the registration from being stalled for long periods
>>>>> -         * by parallel ongoing shrinking.
>>>>> -         */
>>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>>> -            freed = freed ? : 1;
>>>>> -            break;
>>>>> -        }
>>>>>        }
>>>>>    -    up_read(&shrinker_rwsem);
>>>>> -out:
>>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>        cond_resched();
>>>>>        return freed;
>>>>>    }
>>>>> -- 
>>>>> 2.20.1
>>>>>
>>>>>
>>>>
>>>> Hi Qi,
>>>>
>>>> A different problem I realized after my old attempt to use SRCU was that the
>>>> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
>>>> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
>>>> these days, and SRCU is too unfair to the unregister path IMO.
>>>
>>> Hi Sultan,
>>>
>>> IIUC, for unregister_shrinker(), the wait time is hardly longer with
>>> SRCU than with shrinker_rwsem before.
>>>
>>> And I just did a simple test. After using the script in cover letter to
>>> increase the shrink_slab hotspot, I did umount 1k times at the same
>>> time, and then I used bpftrace to measure the time consumption of
>>> unregister_shrinker() as follows:
>>>
>>> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
>>>
>>> @ns[umount]:
>>> [16K, 32K)             3 |      |
>>> [32K, 64K)            66 |@@@@@@@@@@      |
>>> [64K, 128K)           32 |@@@@@      |
>>> [128K, 256K)          22 |@@@      |
>>> [256K, 512K)          48 |@@@@@@@      |
>>> [512K, 1M)            19 |@@@      |
>>> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
>>> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
>>> [8M, 16M)             55 |@@@@@@@@@
>>>
>>> I see that the highest time-consuming of unregister_shrinker() is between 8ms and 16ms, which feels tolerable?

Hi Kirill,

>>
>> The fundamental difference is that before the patchset this for_each_set_bit() iteration could be broken in the middle
>> of two do_shrink_slab() calls, while after the patchset we can leave for_each_set_bit() only after visiting all set bits.

After looking at the git log[1], I saw that we originally introduced
rwsem_is_contended() here to avoid blocking register_shrinker(),
not unregister_shrinker().

So I am curious, do we really care about the speed of
unregister_shrinker()?

And after using SRCU, register_shrinker() will not be blocked by slab
shrink at all.

[1]. https://github.com/torvalds/linux/commit/e496612

>>
>> Using only synchronize_srcu_expedited() won't help here.
>>
>> My opinion is we should restore a check similar to the rwsem_is_contendent() check that we had before. Something like

If we really care about the speed of unregister_shrinker() as much as
that of register_shrinker(), I think this is a good idea. At least it
guarantees that unregister_shrinker() does not get any slower. :)

>> the below on top of your patchset merged into appropriate patch:
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 27ef9946ae8a..50e7812468ec 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>   LIST_HEAD(shrinker_list);
>>   DEFINE_MUTEX(shrinker_mutex);
>>   DEFINE_SRCU(shrinker_srcu);
>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>   
>>   #ifdef CONFIG_MEMCG
>>   static int shrinker_nr_max;
>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>   	debugfs_entry = shrinker_debugfs_remove(shrinker);
>>   	mutex_unlock(&shrinker_mutex);
>>   
>> +	atomic_inc(&shrinker_srcu_generation);
>>   	synchronize_srcu(&shrinker_srcu);
>>   
>>   	debugfs_remove_recursive(debugfs_entry);
>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>    */
>>   void synchronize_shrinkers(void)
>>   {
>> +	atomic_inc(&shrinker_srcu_generation);
>>   	synchronize_srcu(&shrinker_srcu);
>>   }
>>   EXPORT_SYMBOL(synchronize_shrinkers);
>> @@ -908,7 +911,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>   {
>>   	struct shrinker_info *info;
>>   	unsigned long ret, freed = 0;
>> -	int srcu_idx;
>> +	int srcu_idx, generation;
>>   	int i;
>>   
>>   	if (!mem_cgroup_online(memcg))
>> @@ -919,6 +922,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>   	if (unlikely(!info))
>>   		goto unlock;
>>   
>> +	generation = atomic_read(&shrinker_srcu_generation);
>>   	for_each_set_bit(i, info->map, info->map_nr_max) {
>>   		struct shrink_control sc = {
>>   			.gfp_mask = gfp_mask,
>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>   				set_shrinker_bit(memcg, nid, i);
>>   		}
>>   		freed += ret;
>> +
>> +		if (atomic_read(&shrinker_srcu_generation) != generation) {
>> +			freed = freed ? : 1;
>> +			break;
>> +		}
>>   	}
>>   unlock:
>>   	srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>   {
>>   	unsigned long ret, freed = 0;
>>   	struct shrinker *shrinker;
>> -	int srcu_idx;
>> +	int srcu_idx, generation;
>>   
>>   	/*
>>   	 * The root memcg might be allocated even though memcg is disabled
>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>   		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>   
>>   	srcu_idx = srcu_read_lock(&shrinker_srcu);
>> +	generation = atomic_read(&shrinker_srcu_generation);
>>   
>>   	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>   				 srcu_read_lock_held(&shrinker_srcu)) {
>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>   		if (ret == SHRINK_EMPTY)
>>   			ret = 0;
>>   		freed += ret;
>> +
>> +		if (atomic_read(&shrinker_srcu_generation) != generation) {
>> +			freed = freed ? : 1;
>> +			break;
>> +		}
>>   	}
>>   
>>   	srcu_read_unlock(&shrinker_srcu, srcu_idx);
> 
> Even more, for memcg shrinkers we may unlock SRCU and continue iterations from the same shrinker id:

Maybe we can also do this for global slab shrink? Like below:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ffddbd204259..9d8c53075298 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1012,7 +1012,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                                 int priority)
 {
        unsigned long ret, freed = 0;
-       struct shrinker *shrinker;
+       struct shrinker *shrinker = NULL;
        int srcu_idx, generation;

        /*
@@ -1025,11 +1025,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
                return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

+again:
        srcu_idx = srcu_read_lock(&shrinker_srcu);

        generation = atomic_read(&shrinker_srcu_generation);
-       list_for_each_entry_srcu(shrinker, &shrinker_list, list,
-                                srcu_read_lock_held(&shrinker_srcu)) {
+       if (!shrinker)
+               shrinker = list_entry_rcu(shrinker_list.next, struct shrinker, list);
+       else
+               shrinker = list_entry_rcu(shrinker->list.next, struct shrinker, list);
+       list_for_each_entry_from_rcu(shrinker, &shrinker_list, list) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
@@ -1042,8 +1046,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                 freed += ret;

                 if (atomic_read(&shrinker_srcu_generation) != generation) {
-                       freed = freed ? : 1;
-                       break;
+                       srcu_read_unlock(&shrinker_srcu, srcu_idx);
+                       cond_resched();
+                       goto again;
                 }
         }

> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 27ef9946ae8a..0b197bba1257 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>   LIST_HEAD(shrinker_list);
>   DEFINE_MUTEX(shrinker_mutex);
>   DEFINE_SRCU(shrinker_srcu);
> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>   
>   #ifdef CONFIG_MEMCG
>   static int shrinker_nr_max;
> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>   	debugfs_entry = shrinker_debugfs_remove(shrinker);
>   	mutex_unlock(&shrinker_mutex);
>   
> +	atomic_inc(&shrinker_srcu_generation);
>   	synchronize_srcu(&shrinker_srcu);
>   
>   	debugfs_remove_recursive(debugfs_entry);
> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>    */
>   void synchronize_shrinkers(void)
>   {
> +	atomic_inc(&shrinker_srcu_generation);
>   	synchronize_srcu(&shrinker_srcu);
>   }
>   EXPORT_SYMBOL(synchronize_shrinkers);
> @@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>   {
>   	struct shrinker_info *info;
>   	unsigned long ret, freed = 0;
> -	int srcu_idx;
> -	int i;
> +	int srcu_idx, generation;
> +	int i = 0;
>   
>   	if (!mem_cgroup_online(memcg))
>   		return 0;
> -
> +again:
>   	srcu_idx = srcu_read_lock(&shrinker_srcu);
>   	info = shrinker_info_srcu(memcg, nid);
>   	if (unlikely(!info))
>   		goto unlock;
>   
> -	for_each_set_bit(i, info->map, info->map_nr_max) {
> +	generation = atomic_read(&shrinker_srcu_generation);
> +	for_each_set_bit_from(i, info->map, info->map_nr_max) {
>   		struct shrink_control sc = {
>   			.gfp_mask = gfp_mask,
>   			.nid = nid,
> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>   				set_shrinker_bit(memcg, nid, i);
>   		}
>   		freed += ret;
> +
> +		if (atomic_read(&shrinker_srcu_generation) != generation) {
> +			srcu_read_unlock(&shrinker_srcu, srcu_idx);

Maybe we can add the following code here, to avoid repeating the
current id and to avoid triggering a softlockup:

			i++;
			cond_resched();
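
Put together with the hunk above, the restart branch in shrink_slab_memcg()
would then presumably look like this (only a sketch of the combined idea, not
a tested patch):

		if (atomic_read(&shrinker_srcu_generation) != generation) {
			srcu_read_unlock(&shrinker_srcu, srcu_idx);
			i++;		/* don't revisit the id we just shrank */
			cond_resched();	/* give up the CPU on long bitmaps */
			goto again;
		}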

Thanks,
Qi

> +			goto again;
> +		}
>   	}
>   unlock:
>   	srcu_read_unlock(&shrinker_srcu, srcu_idx);
> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>   {
>   	unsigned long ret, freed = 0;
>   	struct shrinker *shrinker;
> -	int srcu_idx;
> +	int srcu_idx, generation;
>   
>   	/*
>   	 * The root memcg might be allocated even though memcg is disabled
> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>   		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>   
>   	srcu_idx = srcu_read_lock(&shrinker_srcu);
> +	generation = atomic_read(&shrinker_srcu_generation);
>   
>   	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>   				 srcu_read_lock_held(&shrinker_srcu)) {
> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>   		if (ret == SHRINK_EMPTY)
>   			ret = 0;
>   		freed += ret;
> +
> +		if (atomic_read(&shrinker_srcu_generation) != generation) {
> +			freed = freed ? : 1;
> +			break;
> +		}
>   	}
>   
>   	srcu_read_unlock(&shrinker_srcu, srcu_idx);
> 
>
Kirill Tkhai Feb. 25, 2023, 3:30 p.m. UTC | #14
On 25.02.2023 11:08, Qi Zheng wrote:
> 
> 
> On 2023/2/25 05:14, Kirill Tkhai wrote:
>> On 25.02.2023 00:02, Kirill Tkhai wrote:
>>> On 24.02.2023 07:00, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>>>>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>>>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>>>>> it is easy to cause blocking in the following cases:
>>>>>>
>>>>>> a. the write lock of shrinker_rwsem was held for too long.
>>>>>>      For example, there are many memcgs in the system, which
>>>>>>      causes some paths to hold locks and traverse it for too
>>>>>>      long. (e.g. expand_shrinker_info())
>>>>>> b. the read lock of shrinker_rwsem was held for too long,
>>>>>>      and a writer came at this time. Then this writer will be
>>>>>>      forced to wait and block all subsequent readers.
>>>>>>      For example:
>>>>>>      - be scheduled when the read lock of shrinker_rwsem is
>>>>>>        held in do_shrink_slab()
>>>>>>      - some shrinker are blocked for too long. Like the case
>>>>>>        mentioned in the patchset[1].
>>>>>>
>>>>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>>>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>>>>> but they all gave up because SRCU was not unconditionally
>>>>>> enabled.
>>>>>>
>>>>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>>>>> the SRCU is unconditionally enabled. So it's time to use
>>>>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>>>>
>>>>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>>>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>>>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>>>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>>>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>>>>
>>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>> ---
>>>>>>    mm/vmscan.c | 27 +++++++++++----------------
>>>>>>    1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>>> index 9f895ca6216c..02987a6f95d1 100644
>>>>>> --- a/mm/vmscan.c
>>>>>> +++ b/mm/vmscan.c
>>>>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>>      LIST_HEAD(shrinker_list);
>>>>>>    DECLARE_RWSEM(shrinker_rwsem);
>>>>>> +DEFINE_SRCU(shrinker_srcu);
>>>>>>      #ifdef CONFIG_MEMCG
>>>>>>    static int shrinker_nr_max;
>>>>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>>>>    void register_shrinker_prepared(struct shrinker *shrinker)
>>>>>>    {
>>>>>>        down_write(&shrinker_rwsem);
>>>>>> -    list_add_tail(&shrinker->list, &shrinker_list);
>>>>>> +    list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>>>>        shrinker->flags |= SHRINKER_REGISTERED;
>>>>>>        shrinker_debugfs_add(shrinker);
>>>>>>        up_write(&shrinker_rwsem);
>>>>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>>>            return;
>>>>>>          down_write(&shrinker_rwsem);
>>>>>> -    list_del(&shrinker->list);
>>>>>> +    list_del_rcu(&shrinker->list);
>>>>>>        shrinker->flags &= ~SHRINKER_REGISTERED;
>>>>>>        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>>>>            unregister_memcg_shrinker(shrinker);
>>>>>>        debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>>>        up_write(&shrinker_rwsem);
>>>>>>    +    synchronize_srcu(&shrinker_srcu);
>>>>>> +
>>>>>>        debugfs_remove_recursive(debugfs_entry);
>>>>>>          kfree(shrinker->nr_deferred);
>>>>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>>>>    {
>>>>>>        down_write(&shrinker_rwsem);
>>>>>>        up_write(&shrinker_rwsem);
>>>>>> +    synchronize_srcu(&shrinker_srcu);
>>>>>>    }
>>>>>>    EXPORT_SYMBOL(synchronize_shrinkers);
>>>>>>    @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>    {
>>>>>>        unsigned long ret, freed = 0;
>>>>>>        struct shrinker *shrinker;
>>>>>> +    int srcu_idx;
>>>>>>          /*
>>>>>>         * The root memcg might be allocated even though memcg is disabled
>>>>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>>>            return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>>    -    if (!down_read_trylock(&shrinker_rwsem))
>>>>>> -        goto out;
>>>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>>    -    list_for_each_entry(shrinker, &shrinker_list, list) {
>>>>>> +    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>>> +                 srcu_read_lock_held(&shrinker_srcu)) {
>>>>>>            struct shrink_control sc = {
>>>>>>                .gfp_mask = gfp_mask,
>>>>>>                .nid = nid,
>>>>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>            if (ret == SHRINK_EMPTY)
>>>>>>                ret = 0;
>>>>>>            freed += ret;
>>>>>> -        /*
>>>>>> -         * Bail out if someone want to register a new shrinker to
>>>>>> -         * prevent the registration from being stalled for long periods
>>>>>> -         * by parallel ongoing shrinking.
>>>>>> -         */
>>>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>>>> -            freed = freed ? : 1;
>>>>>> -            break;
>>>>>> -        }
>>>>>>        }
>>>>>>    -    up_read(&shrinker_rwsem);
>>>>>> -out:
>>>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>>        cond_resched();
>>>>>>        return freed;
>>>>>>    }
>>>>>> -- 
>>>>>> 2.20.1
>>>>>>
>>>>>>
>>>>>
>>>>> Hi Qi,
>>>>>
>>>>> A different problem I realized after my old attempt to use SRCU was that the
>>>>> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
>>>>> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
>>>>> these days, and SRCU is too unfair to the unregister path IMO.
>>>>
>>>> Hi Sultan,
>>>>
>>>> IIUC, for unregister_shrinker(), the wait time is hardly longer with
>>>> SRCU than with shrinker_rwsem before.
>>>>
>>>> And I just did a simple test. After using the script in cover letter to
>>>> increase the shrink_slab hotspot, I did umount 1k times at the same
>>>> time, and then I used bpftrace to measure the time consumption of
>>>> unregister_shrinker() as follows:
>>>>
>>>> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
>>>>
>>>> @ns[umount]:
>>>> [16K, 32K)             3 |      |
>>>> [32K, 64K)            66 |@@@@@@@@@@      |
>>>> [64K, 128K)           32 |@@@@@      |
>>>> [128K, 256K)          22 |@@@      |
>>>> [256K, 512K)          48 |@@@@@@@      |
>>>> [512K, 1M)            19 |@@@      |
>>>> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
>>>> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>>> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
>>>> [8M, 16M)             55 |@@@@@@@@@
>>>>
>>>> I see that the highest time-consuming of unregister_shrinker() is between 8ms and 16ms, which feels tolerable?
> 
> Hi Kirill,
> 
>>>
>>> The fundamental difference is that before the patchset this for_each_set_bit() iteration could be broken in the middle
>>> of two do_shrink_slab() calls, while after the patchset we can leave for_each_set_bit() only after visiting all set bits.
> 
> After looking at the git log[1], I saw that we originally introduced
> rwsem_is_contendent() here to aviod blocking register_shrinker(),
> not unregister_shrinker().
> 
> So I am curious, do we really care about the speed of
> unregister_shrinker()?

My opinion is that, as a general rule, we should avoid long, unbreakable actions in the kernel, especially when they
may be called synchronously from userspace.

We even have this as a generic rule; see check_hung_task().

Before, the longest sleep in unregister_shrinker() was waiting for the single longest do_shrink_slab() call.

After the patch, the longest sleep will be waiting for all of the do_shrink_slab() calls (all set bits in shrinker_info).

> And after using SRCU, register_shrinker() will not be blocked by slab
> shrink at all.
> 
> [1]. https://github.com/torvalds/linux/commit/e496612
> 
>>>
>>> Using only synchronize_srcu_expedited() won't help here.
>>>
>>> My opinion is we should restore a check similar to the rwsem_is_contendent() check that we had before. Something like
> 
> If we really care about the speed of unregister_shrinker() like
> register_shrinker(), I think this is a good idea. This guarantees
> at least the speed of the unregister_shrinker() is not deteriorated. :)
> 
>>> the below on top of your patchset merged into appropriate patch:
>>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 27ef9946ae8a..50e7812468ec 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>   LIST_HEAD(shrinker_list);
>>>   DEFINE_MUTEX(shrinker_mutex);
>>>   DEFINE_SRCU(shrinker_srcu);
>>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>>     #ifdef CONFIG_MEMCG
>>>   static int shrinker_nr_max;
>>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>       debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>       mutex_unlock(&shrinker_mutex);
>>>   +    atomic_inc(&shrinker_srcu_generation);
>>>       synchronize_srcu(&shrinker_srcu);
>>>         debugfs_remove_recursive(debugfs_entry);
>>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>>    */
>>>   void synchronize_shrinkers(void)
>>>   {
>>> +    atomic_inc(&shrinker_srcu_generation);
>>>       synchronize_srcu(&shrinker_srcu);
>>>   }
>>>   EXPORT_SYMBOL(synchronize_shrinkers);
>>> @@ -908,7 +911,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>   {
>>>       struct shrinker_info *info;
>>>       unsigned long ret, freed = 0;
>>> -    int srcu_idx;
>>> +    int srcu_idx, generation;
>>>       int i;
>>>         if (!mem_cgroup_online(memcg))
>>> @@ -919,6 +922,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>       if (unlikely(!info))
>>>           goto unlock;
>>>   +    generation = atomic_read(&shrinker_srcu_generation);
>>>       for_each_set_bit(i, info->map, info->map_nr_max) {
>>>           struct shrink_control sc = {
>>>               .gfp_mask = gfp_mask,
>>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>                   set_shrinker_bit(memcg, nid, i);
>>>           }
>>>           freed += ret;
>>> +
>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>> +            freed = freed ? : 1;
>>> +            break;
>>> +        }
>>>       }
>>>   unlock:
>>>       srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>   {
>>>       unsigned long ret, freed = 0;
>>>       struct shrinker *shrinker;
>>> -    int srcu_idx;
>>> +    int srcu_idx, generation;
>>>         /*
>>>        * The root memcg might be allocated even though memcg is disabled
>>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>           return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>         srcu_idx = srcu_read_lock(&shrinker_srcu);
>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>         list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>                    srcu_read_lock_held(&shrinker_srcu)) {
>>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>           if (ret == SHRINK_EMPTY)
>>>               ret = 0;
>>>           freed += ret;
>>> +
>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>> +            freed = freed ? : 1;
>>> +            break;
>>> +        }
>>>       }
>>>         srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>
>> Even more, for memcg shrinkers we may unlock SRCU and continue iterations from the same shrinker id:
> 
> Maybe we can also do this for global slab shrink? Like below:
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ffddbd204259..9d8c53075298 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1012,7 +1012,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>                                  int priority)
>  {
>         unsigned long ret, freed = 0;
> -       struct shrinker *shrinker;
> +       struct shrinker *shrinker = NULL;
>         int srcu_idx, generation;
> 
>         /*
> @@ -1025,11 +1025,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>         if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>                 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
> 
> +again:
>         srcu_idx = srcu_read_lock(&shrinker_srcu);
> 
>         generation = atomic_read(&shrinker_srcu_generation);
> -       list_for_each_entry_srcu(shrinker, &shrinker_list, list,
> -                                srcu_read_lock_held(&shrinker_srcu)) {
> +       if (!shrinker)
> +               shrinker = list_entry_rcu(shrinker_list.next, struct shrinker, list);
> +       else
> +               shrinker = list_entry_rcu(shrinker->list.next, struct shrinker, list);
> +       list_for_each_entry_from_rcu(shrinker, &shrinker_list, list) {
>                 struct shrink_control sc = {
>                         .gfp_mask = gfp_mask,
>                         .nid = nid,
> @@ -1042,8 +1046,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>                 freed += ret;
> 
>                 if (atomic_read(&shrinker_srcu_generation) != generation) {
> -                       freed = freed ? : 1;
> -                       break;
> +                       srcu_read_unlock(&shrinker_srcu, srcu_idx);
> +                       cond_resched();
> +                       goto again;
>                 }
>         }
> 
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 27ef9946ae8a..0b197bba1257 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>   LIST_HEAD(shrinker_list);
>>   DEFINE_MUTEX(shrinker_mutex);
>>   DEFINE_SRCU(shrinker_srcu);
>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>     #ifdef CONFIG_MEMCG
>>   static int shrinker_nr_max;
>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>       debugfs_entry = shrinker_debugfs_remove(shrinker);
>>       mutex_unlock(&shrinker_mutex);
>>   +    atomic_inc(&shrinker_srcu_generation);
>>       synchronize_srcu(&shrinker_srcu);
>>         debugfs_remove_recursive(debugfs_entry);
>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>    */
>>   void synchronize_shrinkers(void)
>>   {
>> +    atomic_inc(&shrinker_srcu_generation);
>>       synchronize_srcu(&shrinker_srcu);
>>   }
>>   EXPORT_SYMBOL(synchronize_shrinkers);
>> @@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>   {
>>       struct shrinker_info *info;
>>       unsigned long ret, freed = 0;
>> -    int srcu_idx;
>> -    int i;
>> +    int srcu_idx, generation;
>> +    int i = 0;
>>         if (!mem_cgroup_online(memcg))
>>           return 0;
>> -
>> +again:
>>       srcu_idx = srcu_read_lock(&shrinker_srcu);
>>       info = shrinker_info_srcu(memcg, nid);
>>       if (unlikely(!info))
>>           goto unlock;
>>   -    for_each_set_bit(i, info->map, info->map_nr_max) {
>> +    generation = atomic_read(&shrinker_srcu_generation);
>> +    for_each_set_bit_from(i, info->map, info->map_nr_max) {
>>           struct shrink_control sc = {
>>               .gfp_mask = gfp_mask,
>>               .nid = nid,
>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>                   set_shrinker_bit(memcg, nid, i);
>>           }
>>           freed += ret;
>> +
>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>> +            srcu_read_unlock(&shrinker_srcu, srcu_idx);
> 
> Maybe we can add the following code here, so as to avoid repeating the
> current id and avoid triggering a softlockup:
> 
>             i++;
>             cond_resched();
> 
> Thanks,
> Qi
> 
>> +            goto again;
>> +        }
>>       }
>>   unlock:
>>       srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>   {
>>       unsigned long ret, freed = 0;
>>       struct shrinker *shrinker;
>> -    int srcu_idx;
>> +    int srcu_idx, generation;
>>         /*
>>        * The root memcg might be allocated even though memcg is disabled
>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>           return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>         srcu_idx = srcu_read_lock(&shrinker_srcu);
>> +    generation = atomic_read(&shrinker_srcu_generation);
>>         list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>                    srcu_read_lock_held(&shrinker_srcu)) {
>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>           if (ret == SHRINK_EMPTY)
>>               ret = 0;
>>           freed += ret;
>> +
>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>> +            freed = freed ? : 1;
>> +            break;
>> +        }
>>       }
>>         srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>
>>
>
Qi Zheng Feb. 25, 2023, 3:57 p.m. UTC | #15
On 2023/2/25 23:30, Kirill Tkhai wrote:
> On 25.02.2023 11:08, Qi Zheng wrote:
>>
>>
>> On 2023/2/25 05:14, Kirill Tkhai wrote:
>>> On 25.02.2023 00:02, Kirill Tkhai wrote:
>>>> On 24.02.2023 07:00, Qi Zheng wrote:
>>>>>
>>>>>
>>>>> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>>>>>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>>>>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>>>>>> it is easy to cause blocking in the following cases:
>>>>>>>
>>>>>>> a. the write lock of shrinker_rwsem was held for too long.
>>>>>>>       For example, there are many memcgs in the system, which
>>>>>>>       causes some paths to hold locks and traverse it for too
>>>>>>>       long. (e.g. expand_shrinker_info())
>>>>>>> b. the read lock of shrinker_rwsem was held for too long,
>>>>>>>       and a writer came at this time. Then this writer will be
>>>>>>>       forced to wait and block all subsequent readers.
>>>>>>>       For example:
>>>>>>>       - be scheduled when the read lock of shrinker_rwsem is
>>>>>>>         held in do_shrink_slab()
>>>>>>>       - some shrinker are blocked for too long. Like the case
>>>>>>>         mentioned in the patchset[1].
>>>>>>>
>>>>>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>>>>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>>>>>> but they all gave up because SRCU was not unconditionally
>>>>>>> enabled.
>>>>>>>
>>>>>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>>>>>> the SRCU is unconditionally enabled. So it's time to use
>>>>>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>>>>>
>>>>>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>>>>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>>>>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>>>>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>>>>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>>>>>
>>>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>> ---
>>>>>>>     mm/vmscan.c | 27 +++++++++++----------------
>>>>>>>     1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>>
>>>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>>>> index 9f895ca6216c..02987a6f95d1 100644
>>>>>>> --- a/mm/vmscan.c
>>>>>>> +++ b/mm/vmscan.c
>>>>>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>>>       LIST_HEAD(shrinker_list);
>>>>>>>     DECLARE_RWSEM(shrinker_rwsem);
>>>>>>> +DEFINE_SRCU(shrinker_srcu);
>>>>>>>       #ifdef CONFIG_MEMCG
>>>>>>>     static int shrinker_nr_max;
>>>>>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>>>>>     void register_shrinker_prepared(struct shrinker *shrinker)
>>>>>>>     {
>>>>>>>         down_write(&shrinker_rwsem);
>>>>>>> -    list_add_tail(&shrinker->list, &shrinker_list);
>>>>>>> +    list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>>>>>         shrinker->flags |= SHRINKER_REGISTERED;
>>>>>>>         shrinker_debugfs_add(shrinker);
>>>>>>>         up_write(&shrinker_rwsem);
>>>>>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>>>>             return;
>>>>>>>           down_write(&shrinker_rwsem);
>>>>>>> -    list_del(&shrinker->list);
>>>>>>> +    list_del_rcu(&shrinker->list);
>>>>>>>         shrinker->flags &= ~SHRINKER_REGISTERED;
>>>>>>>         if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>>>>>             unregister_memcg_shrinker(shrinker);
>>>>>>>         debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>>>>         up_write(&shrinker_rwsem);
>>>>>>>     +    synchronize_srcu(&shrinker_srcu);
>>>>>>> +
>>>>>>>         debugfs_remove_recursive(debugfs_entry);
>>>>>>>           kfree(shrinker->nr_deferred);
>>>>>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>>>>>     {
>>>>>>>         down_write(&shrinker_rwsem);
>>>>>>>         up_write(&shrinker_rwsem);
>>>>>>> +    synchronize_srcu(&shrinker_srcu);
>>>>>>>     }
>>>>>>>     EXPORT_SYMBOL(synchronize_shrinkers);
>>>>>>>     @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>>     {
>>>>>>>         unsigned long ret, freed = 0;
>>>>>>>         struct shrinker *shrinker;
>>>>>>> +    int srcu_idx;
>>>>>>>           /*
>>>>>>>          * The root memcg might be allocated even though memcg is disabled
>>>>>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>>         if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>>>>             return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>>>     -    if (!down_read_trylock(&shrinker_rwsem))
>>>>>>> -        goto out;
>>>>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>>>     -    list_for_each_entry(shrinker, &shrinker_list, list) {
>>>>>>> +    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>>>> +                 srcu_read_lock_held(&shrinker_srcu)) {
>>>>>>>             struct shrink_control sc = {
>>>>>>>                 .gfp_mask = gfp_mask,
>>>>>>>                 .nid = nid,
>>>>>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>>             if (ret == SHRINK_EMPTY)
>>>>>>>                 ret = 0;
>>>>>>>             freed += ret;
>>>>>>> -        /*
>>>>>>> -         * Bail out if someone want to register a new shrinker to
>>>>>>> -         * prevent the registration from being stalled for long periods
>>>>>>> -         * by parallel ongoing shrinking.
>>>>>>> -         */
>>>>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>>>>> -            freed = freed ? : 1;
>>>>>>> -            break;
>>>>>>> -        }
>>>>>>>         }
>>>>>>>     -    up_read(&shrinker_rwsem);
>>>>>>> -out:
>>>>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>>>         cond_resched();
>>>>>>>         return freed;
>>>>>>>     }
>>>>>>> -- 
>>>>>>> 2.20.1
>>>>>>>
>>>>>>>
>>>>>>
>>>>>> Hi Qi,
>>>>>>
>>>>>> A different problem I realized after my old attempt to use SRCU was that the
>>>>>> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
>>>>>> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
>>>>>> these days, and SRCU is too unfair to the unregister path IMO.
>>>>>
>>>>> Hi Sultan,
>>>>>
>>>>> IIUC, for unregister_shrinker(), the wait time is hardly longer with
>>>>> SRCU than with shrinker_rwsem before.
>>>>>
>>>>> And I just did a simple test. After using the script in cover letter to
>>>>> increase the shrink_slab hotspot, I did umount 1k times at the same
>>>>> time, and then I used bpftrace to measure the time consumption of
>>>>> unregister_shrinker() as follows:
>>>>>
>>>>> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
>>>>>
>>>>> @ns[umount]:
>>>>> [16K, 32K)             3 |      |
>>>>> [32K, 64K)            66 |@@@@@@@@@@      |
>>>>> [64K, 128K)           32 |@@@@@      |
>>>>> [128K, 256K)          22 |@@@      |
>>>>> [256K, 512K)          48 |@@@@@@@      |
>>>>> [512K, 1M)            19 |@@@      |
>>>>> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
>>>>> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>>>> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
>>>>> [8M, 16M)             55 |@@@@@@@@@
>>>>>
>>>>> I see that the worst-case latency of unregister_shrinker() is between 8ms and 16ms, which feels tolerable?
>>
>> Hi Kirill,
>>
>>>>
>>>> The fundamental difference is that before the patchset this for_each_set_bit() iteration could bail out between
>>>> two do_shrink_slab() calls, while after the patchset we can only leave for_each_set_bit() after visiting all set bits.
>>
>> After looking at the git log[1], I saw that we originally introduced
>> the rwsem_is_contended() check here to avoid blocking register_shrinker(),
>> not unregister_shrinker().
>>
>> So I am curious, do we really care about the speed of
>> unregister_shrinker()?
> 
> My opinion is that, on general principles, we should avoid long unbreakable actions in the kernel, especially when they may be called
> synchronously from userspace.

Got it.

And maybe you missed the previous comments below.

> 
> We even have this as a generic rule. See check_hung_task().
> 
> Before, the longest sleep in unregister_shrinker() was a sleep waiting for the single longest do_shrink_slab() call.
> 
> After the patch, the longest sleep will be a sleep waiting for all do_shrink_slab() calls (all set bits in shrinker_info).
> 
>> And after using SRCU, register_shrinker() will not be blocked by slab
>> shrink at all.
>>
>> [1]. https://github.com/torvalds/linux/commit/e496612
>>
>>>>
>>>> Using only synchronize_srcu_expedited() won't help here.
>>>>
>>>>> My opinion is we should restore a check similar to the rwsem_is_contended() check that we had before. Something like
>>
>>> If we really care about the speed of unregister_shrinker() as much as
>>> that of register_shrinker(), I think this is a good idea. It guarantees
>>> at least that unregister_shrinker() does not get any slower. :)
>>
>>>> the below on top of your patchset merged into appropriate patch:
>>>>
>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>> index 27ef9946ae8a..50e7812468ec 100644
>>>> --- a/mm/vmscan.c
>>>> +++ b/mm/vmscan.c
>>>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>    LIST_HEAD(shrinker_list);
>>>>    DEFINE_MUTEX(shrinker_mutex);
>>>>    DEFINE_SRCU(shrinker_srcu);
>>>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>>>      #ifdef CONFIG_MEMCG
>>>>    static int shrinker_nr_max;
>>>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>        debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>        mutex_unlock(&shrinker_mutex);
>>>>    +    atomic_inc(&shrinker_srcu_generation);
>>>>        synchronize_srcu(&shrinker_srcu);
>>>>          debugfs_remove_recursive(debugfs_entry);
>>>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>>>     */
>>>>    void synchronize_shrinkers(void)
>>>>    {
>>>> +    atomic_inc(&shrinker_srcu_generation);
>>>>        synchronize_srcu(&shrinker_srcu);
>>>>    }
>>>>    EXPORT_SYMBOL(synchronize_shrinkers);
>>>> @@ -908,7 +911,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>    {
>>>>        struct shrinker_info *info;
>>>>        unsigned long ret, freed = 0;
>>>> -    int srcu_idx;
>>>> +    int srcu_idx, generation;
>>>>        int i;
>>>>          if (!mem_cgroup_online(memcg))
>>>> @@ -919,6 +922,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>        if (unlikely(!info))
>>>>            goto unlock;
>>>>    +    generation = atomic_read(&shrinker_srcu_generation);
>>>>        for_each_set_bit(i, info->map, info->map_nr_max) {
>>>>            struct shrink_control sc = {
>>>>                .gfp_mask = gfp_mask,
>>>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>                    set_shrinker_bit(memcg, nid, i);
>>>>            }
>>>>            freed += ret;
>>>> +
>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>> +            freed = freed ? : 1;
>>>> +            break;
>>>> +        }
>>>>        }
>>>>    unlock:
>>>>        srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>    {
>>>>        unsigned long ret, freed = 0;
>>>>        struct shrinker *shrinker;
>>>> -    int srcu_idx;
>>>> +    int srcu_idx, generation;
>>>>          /*
>>>>         * The root memcg might be allocated even though memcg is disabled
>>>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>            return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>          srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>>          list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>                     srcu_read_lock_held(&shrinker_srcu)) {
>>>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>            if (ret == SHRINK_EMPTY)
>>>>                ret = 0;
>>>>            freed += ret;
>>>> +
>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>> +            freed = freed ? : 1;
>>>> +            break;
>>>> +        }
>>>>        }
>>>>          srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>
>>> Even more, for memcg shrinkers we may unlock SRCU and continue iterations from the same shrinker id:
>>
>> Maybe we can also do this for global slab shrink? Like below:

How about this?

>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index ffddbd204259..9d8c53075298 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1012,7 +1012,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>                                   int priority)
>>   {
>>          unsigned long ret, freed = 0;
>> -       struct shrinker *shrinker;
>> +       struct shrinker *shrinker = NULL;
>>          int srcu_idx, generation;
>>
>>          /*
>> @@ -1025,11 +1025,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>          if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>                  return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>
>> +again:
>>          srcu_idx = srcu_read_lock(&shrinker_srcu);
>>
>>          generation = atomic_read(&shrinker_srcu_generation);
>> -       list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>> -                                srcu_read_lock_held(&shrinker_srcu)) {
>> +       if (!shrinker)
>> +               shrinker = list_entry_rcu(shrinker_list.next, struct shrinker, list);
>> +       else
>> +               shrinker = list_entry_rcu(shrinker->list.next, struct shrinker, list);
>> +       list_for_each_entry_from_rcu(shrinker, &shrinker_list, list) {
>>                  struct shrink_control sc = {
>>                          .gfp_mask = gfp_mask,
>>                          .nid = nid,
>> @@ -1042,8 +1046,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>                  freed += ret;
>>
>>                  if (atomic_read(&shrinker_srcu_generation) != generation) {
>> -                       freed = freed ? : 1;
>> -                       break;
>> +                       srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> +                       cond_resched();
>> +                       goto again;
>>                  }
>>          }
>>
>>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 27ef9946ae8a..0b197bba1257 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>    LIST_HEAD(shrinker_list);
>>>    DEFINE_MUTEX(shrinker_mutex);
>>>    DEFINE_SRCU(shrinker_srcu);
>>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>>      #ifdef CONFIG_MEMCG
>>>    static int shrinker_nr_max;
>>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>        debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>        mutex_unlock(&shrinker_mutex);
>>>    +    atomic_inc(&shrinker_srcu_generation);
>>>        synchronize_srcu(&shrinker_srcu);
>>>          debugfs_remove_recursive(debugfs_entry);
>>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>>     */
>>>    void synchronize_shrinkers(void)
>>>    {
>>> +    atomic_inc(&shrinker_srcu_generation);
>>>        synchronize_srcu(&shrinker_srcu);
>>>    }
>>>    EXPORT_SYMBOL(synchronize_shrinkers);
>>> @@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>    {
>>>        struct shrinker_info *info;
>>>        unsigned long ret, freed = 0;
>>> -    int srcu_idx;
>>> -    int i;
>>> +    int srcu_idx, generation;
>>> +    int i = 0;
>>>          if (!mem_cgroup_online(memcg))
>>>            return 0;
>>> -
>>> +again:
>>>        srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>        info = shrinker_info_srcu(memcg, nid);
>>>        if (unlikely(!info))
>>>            goto unlock;
>>>    -    for_each_set_bit(i, info->map, info->map_nr_max) {
>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>> +    for_each_set_bit_from(i, info->map, info->map_nr_max) {
>>>            struct shrink_control sc = {
>>>                .gfp_mask = gfp_mask,
>>>                .nid = nid,
>>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>                    set_shrinker_bit(memcg, nid, i);
>>>            }
>>>            freed += ret;
>>> +
>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>> +            srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>
>> Maybe we can add the following code here, so as to avoid repeating the
>> current id and avoid triggering a softlockup:
>>
>>              i++;
>>              cond_resched();

And this. :)

Thanks,
Qi

>>
>> Thanks,
>> Qi
>>
>>> +            goto again;
>>> +        }
>>>        }
>>>    unlock:
>>>        srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>    {
>>>        unsigned long ret, freed = 0;
>>>        struct shrinker *shrinker;
>>> -    int srcu_idx;
>>> +    int srcu_idx, generation;
>>>          /*
>>>         * The root memcg might be allocated even though memcg is disabled
>>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>            return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>          srcu_idx = srcu_read_lock(&shrinker_srcu);
>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>          list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>                     srcu_read_lock_held(&shrinker_srcu)) {
>>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>            if (ret == SHRINK_EMPTY)
>>>                ret = 0;
>>>            freed += ret;
>>> +
>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>> +            freed = freed ? : 1;
>>> +            break;
>>> +        }
>>>        }
>>>          srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>
>>>
>>
>
Kirill Tkhai Feb. 25, 2023, 4:17 p.m. UTC | #16
On 25.02.2023 18:57, Qi Zheng wrote:
> 
> 
> On 2023/2/25 23:30, Kirill Tkhai wrote:
>> On 25.02.2023 11:08, Qi Zheng wrote:
>>>
>>>
>>> On 2023/2/25 05:14, Kirill Tkhai wrote:
>>>> On 25.02.2023 00:02, Kirill Tkhai wrote:
>>>>> On 24.02.2023 07:00, Qi Zheng wrote:
>>>>>>
>>>>>>
>>>>>> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>>>>>>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>>>>>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>>>>>>> it is easy to cause blocking in the following cases:
>>>>>>>>
>>>>>>>> a. the write lock of shrinker_rwsem was held for too long.
>>>>>>>>       For example, there are many memcgs in the system, which
>>>>>>>>       causes some paths to hold locks and traverse it for too
>>>>>>>>       long. (e.g. expand_shrinker_info())
>>>>>>>> b. the read lock of shrinker_rwsem was held for too long,
>>>>>>>>       and a writer came at this time. Then this writer will be
>>>>>>>>       forced to wait and block all subsequent readers.
>>>>>>>>       For example:
>>>>>>>>       - be scheduled when the read lock of shrinker_rwsem is
>>>>>>>>         held in do_shrink_slab()
>>>>>>>>       - some shrinker are blocked for too long. Like the case
>>>>>>>>         mentioned in the patchset[1].
>>>>>>>>
>>>>>>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>>>>>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>>>>>>> but they all gave up because SRCU was not unconditionally
>>>>>>>> enabled.
>>>>>>>>
>>>>>>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>>>>>>> the SRCU is unconditionally enabled. So it's time to use
>>>>>>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>>>>>>
>>>>>>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
>>>>>>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
>>>>>>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
>>>>>>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>>>>>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/
>>>>>>>>
>>>>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>>> ---
>>>>>>>>     mm/vmscan.c | 27 +++++++++++----------------
>>>>>>>>     1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>>>>> index 9f895ca6216c..02987a6f95d1 100644
>>>>>>>> --- a/mm/vmscan.c
>>>>>>>> +++ b/mm/vmscan.c
>>>>>>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>>>>       LIST_HEAD(shrinker_list);
>>>>>>>>     DECLARE_RWSEM(shrinker_rwsem);
>>>>>>>> +DEFINE_SRCU(shrinker_srcu);
>>>>>>>>       #ifdef CONFIG_MEMCG
>>>>>>>>     static int shrinker_nr_max;
>>>>>>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>>>>>>     void register_shrinker_prepared(struct shrinker *shrinker)
>>>>>>>>     {
>>>>>>>>         down_write(&shrinker_rwsem);
>>>>>>>> -    list_add_tail(&shrinker->list, &shrinker_list);
>>>>>>>> +    list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>>>>>>         shrinker->flags |= SHRINKER_REGISTERED;
>>>>>>>>         shrinker_debugfs_add(shrinker);
>>>>>>>>         up_write(&shrinker_rwsem);
>>>>>>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>>>>>             return;
>>>>>>>>           down_write(&shrinker_rwsem);
>>>>>>>> -    list_del(&shrinker->list);
>>>>>>>> +    list_del_rcu(&shrinker->list);
>>>>>>>>         shrinker->flags &= ~SHRINKER_REGISTERED;
>>>>>>>>         if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>>>>>>             unregister_memcg_shrinker(shrinker);
>>>>>>>>         debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>>>>>         up_write(&shrinker_rwsem);
>>>>>>>>     +    synchronize_srcu(&shrinker_srcu);
>>>>>>>> +
>>>>>>>>         debugfs_remove_recursive(debugfs_entry);
>>>>>>>>           kfree(shrinker->nr_deferred);
>>>>>>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>>>>>>     {
>>>>>>>>         down_write(&shrinker_rwsem);
>>>>>>>>         up_write(&shrinker_rwsem);
>>>>>>>> +    synchronize_srcu(&shrinker_srcu);
>>>>>>>>     }
>>>>>>>>     EXPORT_SYMBOL(synchronize_shrinkers);
>>>>>>>>     @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>>>     {
>>>>>>>>         unsigned long ret, freed = 0;
>>>>>>>>         struct shrinker *shrinker;
>>>>>>>> +    int srcu_idx;
>>>>>>>>           /*
>>>>>>>>          * The root memcg might be allocated even though memcg is disabled
>>>>>>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>>>         if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>>>>>             return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>>>>     -    if (!down_read_trylock(&shrinker_rwsem))
>>>>>>>> -        goto out;
>>>>>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>>>>     -    list_for_each_entry(shrinker, &shrinker_list, list) {
>>>>>>>> +    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>>>>> +                 srcu_read_lock_held(&shrinker_srcu)) {
>>>>>>>>             struct shrink_control sc = {
>>>>>>>>                 .gfp_mask = gfp_mask,
>>>>>>>>                 .nid = nid,
>>>>>>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>>>             if (ret == SHRINK_EMPTY)
>>>>>>>>                 ret = 0;
>>>>>>>>             freed += ret;
>>>>>>>> -        /*
>>>>>>>> -         * Bail out if someone want to register a new shrinker to
>>>>>>>> -         * prevent the registration from being stalled for long periods
>>>>>>>> -         * by parallel ongoing shrinking.
>>>>>>>> -         */
>>>>>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>>>>>> -            freed = freed ? : 1;
>>>>>>>> -            break;
>>>>>>>> -        }
>>>>>>>>         }
>>>>>>>>     -    up_read(&shrinker_rwsem);
>>>>>>>> -out:
>>>>>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>>>>         cond_resched();
>>>>>>>>         return freed;
>>>>>>>>     }
>>>>>>>> -- 
>>>>>>>> 2.20.1
>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>> Hi Qi,
>>>>>>>
>>>>>>> A different problem I realized after my old attempt to use SRCU was that the
>>>>>>> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
>>>>>>> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
>>>>>>> these days, and SRCU is too unfair to the unregister path IMO.
>>>>>>
>>>>>> Hi Sultan,
>>>>>>
>>>>>> IIUC, for unregister_shrinker(), the wait time is hardly longer with
>>>>>> SRCU than with shrinker_rwsem before.
>>>>>>
>>>>>> And I just did a simple test. After using the script in cover letter to
>>>>>> increase the shrink_slab hotspot, I did umount 1k times at the same
>>>>>> time, and then I used bpftrace to measure the time consumption of
>>>>>> unregister_shrinker() as follows:
>>>>>>
>>>>>> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
>>>>>>
>>>>>> @ns[umount]:
>>>>>> [16K, 32K)             3 |      |
>>>>>> [32K, 64K)            66 |@@@@@@@@@@      |
>>>>>> [64K, 128K)           32 |@@@@@      |
>>>>>> [128K, 256K)          22 |@@@      |
>>>>>> [256K, 512K)          48 |@@@@@@@      |
>>>>>> [512K, 1M)            19 |@@@      |
>>>>>> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
>>>>>> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>>>>> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
>>>>>> [8M, 16M)             55 |@@@@@@@@@
>>>>>>
>>>>>> I see that the worst-case latency of unregister_shrinker() is between 8ms and 16ms, which feels tolerable?
>>>
>>> Hi Kirill,
>>>
>>>>>
>>>>> The fundamental difference is that before the patchset this for_each_set_bit() iteration could bail out between
>>>>> two do_shrink_slab() calls, while after the patchset we can only leave for_each_set_bit() after visiting all set bits.
>>>
>>> After looking at the git log[1], I saw that we originally introduced
>>> the rwsem_is_contended() check here to avoid blocking register_shrinker(),
>>> not unregister_shrinker().
>>>
>>> So I am curious, do we really care about the speed of
>>> unregister_shrinker()?
>>
>> My opinion is that, on general principles, we should avoid long unbreakable actions in the kernel, especially when they may be called
>> synchronously from userspace.
> 
> Got it.
> 
> And maybe you missed the previous comments below.

Oh, I really missed them!

>>
>> We even have this as a generic rule. See check_hung_task().
>>
>> Before, the longest sleep in unregister_shrinker() was a sleep waiting for the single longest do_shrink_slab() call.
>>
>> After the patch, the longest sleep will be a sleep waiting for all do_shrink_slab() calls (all set bits in shrinker_info).
>>
>>> And after using SRCU, register_shrinker() will not be blocked by slab
>>> shrink at all.
>>>
>>> [1]. https://github.com/torvalds/linux/commit/e496612
>>>
>>>>>
>>>>> Using only synchronize_srcu_expedited() won't help here.
>>>>>
>>>>> My opinion is we should restore a check similar to the rwsem_is_contended() check that we had before. Something like
>>>
>>> If we really care about the speed of unregister_shrinker() as much as
>>> that of register_shrinker(), I think this is a good idea. It guarantees
>>> at least that unregister_shrinker() does not get any slower. :)
>>>
>>>>> the below on top of your patchset merged into appropriate patch:
>>>>>
>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>> index 27ef9946ae8a..50e7812468ec 100644
>>>>> --- a/mm/vmscan.c
>>>>> +++ b/mm/vmscan.c
>>>>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>    LIST_HEAD(shrinker_list);
>>>>>    DEFINE_MUTEX(shrinker_mutex);
>>>>>    DEFINE_SRCU(shrinker_srcu);
>>>>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>>>>      #ifdef CONFIG_MEMCG
>>>>>    static int shrinker_nr_max;
>>>>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>>        debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>>        mutex_unlock(&shrinker_mutex);
>>>>>    +    atomic_inc(&shrinker_srcu_generation);
>>>>>        synchronize_srcu(&shrinker_srcu);
>>>>>          debugfs_remove_recursive(debugfs_entry);
>>>>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>>>>     */
>>>>>    void synchronize_shrinkers(void)
>>>>>    {
>>>>> +    atomic_inc(&shrinker_srcu_generation);
>>>>>        synchronize_srcu(&shrinker_srcu);
>>>>>    }
>>>>>    EXPORT_SYMBOL(synchronize_shrinkers);
>>>>> @@ -908,7 +911,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>    {
>>>>>        struct shrinker_info *info;
>>>>>        unsigned long ret, freed = 0;
>>>>> -    int srcu_idx;
>>>>> +    int srcu_idx, generation;
>>>>>        int i;
>>>>>          if (!mem_cgroup_online(memcg))
>>>>> @@ -919,6 +922,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>        if (unlikely(!info))
>>>>>            goto unlock;
>>>>>    +    generation = atomic_read(&shrinker_srcu_generation);
>>>>>        for_each_set_bit(i, info->map, info->map_nr_max) {
>>>>>            struct shrink_control sc = {
>>>>>                .gfp_mask = gfp_mask,
>>>>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>                    set_shrinker_bit(memcg, nid, i);
>>>>>            }
>>>>>            freed += ret;
>>>>> +
>>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>>> +            freed = freed ? : 1;
>>>>> +            break;
>>>>> +        }
>>>>>        }
>>>>>    unlock:
>>>>>        srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>    {
>>>>>        unsigned long ret, freed = 0;
>>>>>        struct shrinker *shrinker;
>>>>> -    int srcu_idx;
>>>>> +    int srcu_idx, generation;
>>>>>          /*
>>>>>         * The root memcg might be allocated even though memcg is disabled
>>>>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>            return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>          srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>>>          list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>>                     srcu_read_lock_held(&shrinker_srcu)) {
>>>>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>            if (ret == SHRINK_EMPTY)
>>>>>                ret = 0;
>>>>>            freed += ret;
>>>>> +
>>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>>> +            freed = freed ? : 1;
>>>>> +            break;
>>>>> +        }
>>>>>        }
>>>>>          srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>
>>>> Even more, for memcg shrinkers we may unlock SRCU and continue iterations from the same shrinker id:
>>>
>>> Maybe we can also do this for global slab shrink? Like below:
> 
> How about this?
>>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index ffddbd204259..9d8c53075298 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -1012,7 +1012,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>                                   int priority)
>>>   {
>>>          unsigned long ret, freed = 0;
>>> -       struct shrinker *shrinker;
>>> +       struct shrinker *shrinker = NULL;
>>>          int srcu_idx, generation;
>>>
>>>          /*
>>> @@ -1025,11 +1025,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>          if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>                  return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>
>>> +again:
>>>          srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>
>>>          generation = atomic_read(&shrinker_srcu_generation);
>>> -       list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>> -                                srcu_read_lock_held(&shrinker_srcu)) {
>>> +       if (!shrinker)
>>> +               shrinker = list_entry_rcu(shrinker_list.next, struct shrinker, list);
>>> +       else
>>> +               shrinker = list_entry_rcu(shrinker->list.next, struct shrinker, list);
>>> +       list_for_each_entry_from_rcu(shrinker, &shrinker_list, list) {
>>>                  struct shrink_control sc = {
>>>                          .gfp_mask = gfp_mask,
>>>                          .nid = nid,
>>> @@ -1042,8 +1046,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>                  freed += ret;
>>>
>>>                  if (atomic_read(&shrinker_srcu_generation) != generation) {
>>> -                       freed = freed ? : 1;
>>> -                       break;
>>> +                       srcu_read_unlock(&shrinker_srcu, srcu_idx);

After SRCU is unlocked we can't trust @shrinker anymore, so the above list_entry_rcu(shrinker->list.next)
would dereference some random memory.
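
To make the hazard concrete, here is roughly what the proposed "goto again" path turns into (illustrative only; when exactly the shrinker gets freed depends on its owner):

	srcu_read_unlock(&shrinker_srcu, srcu_idx);
	cond_resched();
	/*
	 * With the read lock dropped, the synchronize_srcu() that
	 * unregister_shrinker() is sleeping in may return, and the
	 * shrinker's owner is then free to free the shrinker structure.
	 */
	shrinker = list_entry_rcu(shrinker->list.next,
				  struct shrinker, list);	/* possible use-after-free */

The shrink_slab_memcg() variant is fine in this respect, since it restarts from the bit index i rather than from a pointer into the list.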

>>> +                       cond_resched();
>>> +                       goto again;
>>>                  }
>>>          }
>>>
>>>>
>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>> index 27ef9946ae8a..0b197bba1257 100644
>>>> --- a/mm/vmscan.c
>>>> +++ b/mm/vmscan.c
>>>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>    LIST_HEAD(shrinker_list);
>>>>    DEFINE_MUTEX(shrinker_mutex);
>>>>    DEFINE_SRCU(shrinker_srcu);
>>>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>>>      #ifdef CONFIG_MEMCG
>>>>    static int shrinker_nr_max;
>>>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>        debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>        mutex_unlock(&shrinker_mutex);
>>>>    +    atomic_inc(&shrinker_srcu_generation);
>>>>        synchronize_srcu(&shrinker_srcu);
>>>>          debugfs_remove_recursive(debugfs_entry);
>>>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>>>     */
>>>>    void synchronize_shrinkers(void)
>>>>    {
>>>> +    atomic_inc(&shrinker_srcu_generation);
>>>>        synchronize_srcu(&shrinker_srcu);
>>>>    }
>>>>    EXPORT_SYMBOL(synchronize_shrinkers);
>>>> @@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>    {
>>>>        struct shrinker_info *info;
>>>>        unsigned long ret, freed = 0;
>>>> -    int srcu_idx;
>>>> -    int i;
>>>> +    int srcu_idx, generation;
>>>> +    int i = 0;
>>>>          if (!mem_cgroup_online(memcg))
>>>>            return 0;
>>>> -
>>>> +again:
>>>>        srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>        info = shrinker_info_srcu(memcg, nid);
>>>>        if (unlikely(!info))
>>>>            goto unlock;
>>>>    -    for_each_set_bit(i, info->map, info->map_nr_max) {
>>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>> +    for_each_set_bit_from(i, info->map, info->map_nr_max) {
>>>>            struct shrink_control sc = {
>>>>                .gfp_mask = gfp_mask,
>>>>                .nid = nid,
>>>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>                    set_shrinker_bit(memcg, nid, i);
>>>>            }
>>>>            freed += ret;
>>>> +
>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>> +            srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>
>>> Maybe we can add the following code here, so as to avoid repeating the
>>> current id and avoid triggering a softlockup:
>>>
>>>              i++;

This is OK.

>>>              cond_resched();

Possibly, but the existing cond_resched() in do_shrink_slab() is already enough.

> And this. :)
> 
> Thanks,
> Qi
> 
>>>
>>> Thanks,
>>> Qi
>>>
>>>> +            goto again;
>>>> +        }
>>>>        }
>>>>    unlock:
>>>>        srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>    {
>>>>        unsigned long ret, freed = 0;
>>>>        struct shrinker *shrinker;
>>>> -    int srcu_idx;
>>>> +    int srcu_idx, generation;
>>>>          /*
>>>>         * The root memcg might be allocated even though memcg is disabled
>>>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>            return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>          srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>>          list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>                     srcu_read_lock_held(&shrinker_srcu)) {
>>>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>            if (ret == SHRINK_EMPTY)
>>>>                ret = 0;
>>>>            freed += ret;
>>>> +
>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>> +            freed = freed ? : 1;
>>>> +            break;
>>>> +        }
>>>>        }
>>>>          srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>
>>>>
>>>
>>
>
Qi Zheng Feb. 25, 2023, 4:37 p.m. UTC | #17
On 2023/2/26 00:17, Kirill Tkhai wrote:
> On 25.02.2023 18:57, Qi Zheng wrote:
>>
<...>
>> How about this?
>>>>
>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>> index ffddbd204259..9d8c53075298 100644
>>>> --- a/mm/vmscan.c
>>>> +++ b/mm/vmscan.c
>>>> @@ -1012,7 +1012,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>                                    int priority)
>>>>    {
>>>>           unsigned long ret, freed = 0;
>>>> -       struct shrinker *shrinker;
>>>> +       struct shrinker *shrinker = NULL;
>>>>           int srcu_idx, generation;
>>>>
>>>>           /*
>>>> @@ -1025,11 +1025,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>           if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>                   return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>
>>>> +again:
>>>>           srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>
>>>>           generation = atomic_read(&shrinker_srcu_generation);
>>>> -       list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>> -                                srcu_read_lock_held(&shrinker_srcu)) {
>>>> +       if (!shrinker)
>>>> +               shrinker = list_entry_rcu(shrinker_list.next, struct shrinker, list);
>>>> +       else
>>>> +               shrinker = list_entry_rcu(shrinker->list.next, struct shrinker, list);
>>>> +       list_for_each_entry_from_rcu(shrinker, &shrinker_list, list) {
>>>>                   struct shrink_control sc = {
>>>>                           .gfp_mask = gfp_mask,
>>>>                           .nid = nid,
>>>> @@ -1042,8 +1046,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>                   freed += ret;
>>>>
>>>>                   if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>> -                       freed = freed ? : 1;
>>>> -                       break;
>>>> +                       srcu_read_unlock(&shrinker_srcu, srcu_idx);
> 
> After SRCU is unlocked we can't trust @shrinker anymore, so the above list_entry_rcu(shrinker->list.next)
> would dereference some random memory.

Indeed.

> 
>>>> +                       cond_resched();
>>>> +                       goto again;
>>>>                   }
>>>>           }
>>>>
>>>>>
>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>> index 27ef9946ae8a..0b197bba1257 100644
>>>>> --- a/mm/vmscan.c
>>>>> +++ b/mm/vmscan.c
>>>>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>     LIST_HEAD(shrinker_list);
>>>>>     DEFINE_MUTEX(shrinker_mutex);
>>>>>     DEFINE_SRCU(shrinker_srcu);
>>>>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>>>>       #ifdef CONFIG_MEMCG
>>>>>     static int shrinker_nr_max;
>>>>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>>         debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>>         mutex_unlock(&shrinker_mutex);
>>>>>     +    atomic_inc(&shrinker_srcu_generation);
>>>>>         synchronize_srcu(&shrinker_srcu);
>>>>>           debugfs_remove_recursive(debugfs_entry);
>>>>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>>>>      */
>>>>>     void synchronize_shrinkers(void)
>>>>>     {
>>>>> +    atomic_inc(&shrinker_srcu_generation);
>>>>>         synchronize_srcu(&shrinker_srcu);
>>>>>     }
>>>>>     EXPORT_SYMBOL(synchronize_shrinkers);
>>>>> @@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>     {
>>>>>         struct shrinker_info *info;
>>>>>         unsigned long ret, freed = 0;
>>>>> -    int srcu_idx;
>>>>> -    int i;
>>>>> +    int srcu_idx, generation;
>>>>> +    int i = 0;
>>>>>           if (!mem_cgroup_online(memcg))
>>>>>             return 0;
>>>>> -
>>>>> +again:
>>>>>         srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>         info = shrinker_info_srcu(memcg, nid);
>>>>>         if (unlikely(!info))
>>>>>             goto unlock;
>>>>>     -    for_each_set_bit(i, info->map, info->map_nr_max) {
>>>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>>> +    for_each_set_bit_from(i, info->map, info->map_nr_max) {
>>>>>             struct shrink_control sc = {
>>>>>                 .gfp_mask = gfp_mask,
>>>>>                 .nid = nid,
>>>>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>                     set_shrinker_bit(memcg, nid, i);
>>>>>             }
>>>>>             freed += ret;
>>>>> +
>>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>>> +            srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>
>>>> Maybe we can add the following code here, so as to avoid repeating the
>>>> current id and avoid triggering softlockup:
>>>>
>>>>               i++;
> 
> This is OK.
> 
>>>>               cond_resched();
> 
> Possibly, but the existing cond_resched() in do_shrink_slab() is already enough.

Yeah.

I will add this patch in the next version. May I mark you as the author
of this patch?

Thanks,
Qi

> 
>> And this. :)
>>
>> Thanks,
>> Qi
>>
>>>>
>>>> Thanks,
>>>> Qi
>>>>
>>>>> +            goto again;
>>>>> +        }
>>>>>         }
>>>>>     unlock:
>>>>>         srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>     {
>>>>>         unsigned long ret, freed = 0;
>>>>>         struct shrinker *shrinker;
>>>>> -    int srcu_idx;
>>>>> +    int srcu_idx, generation;
>>>>>           /*
>>>>>          * The root memcg might be allocated even though memcg is disabled
>>>>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>             return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>           srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>>>           list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>>                      srcu_read_lock_held(&shrinker_srcu)) {
>>>>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>             if (ret == SHRINK_EMPTY)
>>>>>                 ret = 0;
>>>>>             freed += ret;
>>>>> +
>>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>>> +            freed = freed ? : 1;
>>>>> +            break;
>>>>> +        }
>>>>>         }
>>>>>           srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>
>>>>>
>>>>
>>>
>>
>
Kirill Tkhai Feb. 25, 2023, 9:28 p.m. UTC | #18
On 25.02.2023 19:37, Qi Zheng wrote:
> 
> 
> On 2023/2/26 00:17, Kirill Tkhai wrote:
>> On 25.02.2023 18:57, Qi Zheng wrote:
>>>
> <...>
>>> How about this?
>>>>>
>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>> index ffddbd204259..9d8c53075298 100644
>>>>> --- a/mm/vmscan.c
>>>>> +++ b/mm/vmscan.c
>>>>> @@ -1012,7 +1012,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>                                    int priority)
>>>>>    {
>>>>>           unsigned long ret, freed = 0;
>>>>> -       struct shrinker *shrinker;
>>>>> +       struct shrinker *shrinker = NULL;
>>>>>           int srcu_idx, generation;
>>>>>
>>>>>           /*
>>>>> @@ -1025,11 +1025,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>           if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>>                   return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>
>>>>> +again:
>>>>>           srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>
>>>>>           generation = atomic_read(&shrinker_srcu_generation);
>>>>> -       list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>> -                                srcu_read_lock_held(&shrinker_srcu)) {
>>>>> +       if (!shrinker)
>>>>> +               shrinker = list_entry_rcu(shrinker_list.next, struct shrinker, list);
>>>>> +       else
>>>>> +               shrinker = list_entry_rcu(shrinker->list.next, struct shrinker, list);
>>>>> +       list_for_each_entry_from_rcu(shrinker, &shrinker_list, list) {
>>>>>                   struct shrink_control sc = {
>>>>>                           .gfp_mask = gfp_mask,
>>>>>                           .nid = nid,
>>>>> @@ -1042,8 +1046,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>                   freed += ret;
>>>>>
>>>>>                   if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>>> -                       freed = freed ? : 1;
>>>>> -                       break;
>>>>> +                       srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>
>> After SRCU is unlocked we can't trust @shrinker anymore, so the above list_entry_rcu(shrinker->list.next)
>> would dereference some random memory.
> 
> Indeed.
> 
>>
>>>>> +                       cond_resched();
>>>>> +                       goto again;
>>>>>                   }
>>>>>           }
>>>>>
>>>>>>
>>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>>> index 27ef9946ae8a..0b197bba1257 100644
>>>>>> --- a/mm/vmscan.c
>>>>>> +++ b/mm/vmscan.c
>>>>>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>>     LIST_HEAD(shrinker_list);
>>>>>>     DEFINE_MUTEX(shrinker_mutex);
>>>>>>     DEFINE_SRCU(shrinker_srcu);
>>>>>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>>>>>       #ifdef CONFIG_MEMCG
>>>>>>     static int shrinker_nr_max;
>>>>>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>>>         debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>>>         mutex_unlock(&shrinker_mutex);
>>>>>>     +    atomic_inc(&shrinker_srcu_generation);
>>>>>>         synchronize_srcu(&shrinker_srcu);
>>>>>>           debugfs_remove_recursive(debugfs_entry);
>>>>>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>>>>>      */
>>>>>>     void synchronize_shrinkers(void)
>>>>>>     {
>>>>>> +    atomic_inc(&shrinker_srcu_generation);
>>>>>>         synchronize_srcu(&shrinker_srcu);
>>>>>>     }
>>>>>>     EXPORT_SYMBOL(synchronize_shrinkers);
>>>>>> @@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>>     {
>>>>>>         struct shrinker_info *info;
>>>>>>         unsigned long ret, freed = 0;
>>>>>> -    int srcu_idx;
>>>>>> -    int i;
>>>>>> +    int srcu_idx, generation;
>>>>>> +    int i = 0;
>>>>>>           if (!mem_cgroup_online(memcg))
>>>>>>             return 0;
>>>>>> -
>>>>>> +again:
>>>>>>         srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>>         info = shrinker_info_srcu(memcg, nid);
>>>>>>         if (unlikely(!info))
>>>>>>             goto unlock;
>>>>>>     -    for_each_set_bit(i, info->map, info->map_nr_max) {
>>>>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>>>> +    for_each_set_bit_from(i, info->map, info->map_nr_max) {
>>>>>>             struct shrink_control sc = {
>>>>>>                 .gfp_mask = gfp_mask,
>>>>>>                 .nid = nid,
>>>>>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>>>>                     set_shrinker_bit(memcg, nid, i);
>>>>>>             }
>>>>>>             freed += ret;
>>>>>> +
>>>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>>>> +            srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>
>>>>> Maybe we can add the following code here, so that we avoid repeating
>>>>> the current id and avoid triggering a softlockup:
>>>>>
>>>>>               i++;
>>
>> This is OK.
>>
>>>>>               cond_resched();
>>
>> Possibly, but the existing cond_resched() in do_shrink_slab() is enough.
> 
> Yeah.
> 
> I will add this patch in the next version. May I mark you as the author
> of this patch?

I think, yes

>>
>>> And this. :)
>>>
>>> Thanks,
>>> Qi
>>>
>>>>>
>>>>> Thanks,
>>>>> Qi
>>>>>
>>>>>> +            goto again;
>>>>>> +        }
>>>>>>         }
>>>>>>     unlock:
>>>>>>         srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>     {
>>>>>>         unsigned long ret, freed = 0;
>>>>>>         struct shrinker *shrinker;
>>>>>> -    int srcu_idx;
>>>>>> +    int srcu_idx, generation;
>>>>>>           /*
>>>>>>          * The root memcg might be allocated even though memcg is disabled
>>>>>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>             return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>>           srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>>>>           list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>>>                      srcu_read_lock_held(&shrinker_srcu)) {
>>>>>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>             if (ret == SHRINK_EMPTY)
>>>>>>                 ret = 0;
>>>>>>             freed += ret;
>>>>>> +
>>>>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>>>>> +            freed = freed ? : 1;
>>>>>> +            break;
>>>>>> +        }
>>>>>>         }
>>>>>>           srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
>
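The objection above, that a shrinker pointer must not be dereferenced after
srcu_read_unlock(), is the crux of the exchange: a restart has to resume from
state that stays valid across the unlock, which is exactly what the bitmap
index in shrink_slab_memcg() provides. Below is a minimal user-space sketch of
that index-plus-generation restart; C11 atomics and a plain array stand in for
the SRCU machinery and the shrinker_info map, and none of the names are kernel
APIs.

	/* Toy model: restart an interrupted walk from a saved index, never
	 * from a pointer into a structure that may have changed meanwhile. */
	#include <stdatomic.h>
	#include <stdio.h>

	#define NR_SLOTS 8

	static atomic_int generation;		/* models shrinker_srcu_generation */
	static int objects[NR_SLOTS] = {1, 1, 1, 1, 1, 1, 1, 1};

	static unsigned long scan_all(void)
	{
		unsigned long freed = 0;
		int i = 0;			/* survives a restart */
		int gen;

	again:
		gen = atomic_load(&generation);
		for (; i < NR_SLOTS; i++) {
			freed += objects[i];	/* stands in for do_shrink_slab() */

			if (atomic_load(&generation) != gen) {
				/* A concurrent "unregister" happened: restart,
				 * but from i + 1, so no slot is rescanned and
				 * no stale pointer is ever dereferenced. */
				i++;
				goto again;
			}
		}
		return freed;
	}

	int main(void)
	{
		printf("freed %lu\n", scan_all());
		return 0;
	}

This also shows why the suggested i++ matters: without it the walk would rescan
the same slot after every restart and, under a steady stream of generation
bumps, could spin long enough to trip the softlockup detector.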
Qi Zheng Feb. 26, 2023, 1:56 p.m. UTC | #19
On 2023/2/26 05:28, Kirill Tkhai wrote:
> On 25.02.2023 19:37, Qi Zheng wrote:
>> I will add this patch in the next version. May I mark you as the author
>> of this patch?
> 
> I think, yes

Thanks. :)

Qi

Patch

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9f895ca6216c..02987a6f95d1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -202,6 +202,7 @@  static void set_task_reclaim_state(struct task_struct *task,
 
 LIST_HEAD(shrinker_list);
 DECLARE_RWSEM(shrinker_rwsem);
+DEFINE_SRCU(shrinker_srcu);
 
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
@@ -706,7 +707,7 @@  void free_prealloced_shrinker(struct shrinker *shrinker)
 void register_shrinker_prepared(struct shrinker *shrinker)
 {
 	down_write(&shrinker_rwsem);
-	list_add_tail(&shrinker->list, &shrinker_list);
+	list_add_tail_rcu(&shrinker->list, &shrinker_list);
 	shrinker->flags |= SHRINKER_REGISTERED;
 	shrinker_debugfs_add(shrinker);
 	up_write(&shrinker_rwsem);
@@ -760,13 +761,15 @@  void unregister_shrinker(struct shrinker *shrinker)
 		return;
 
 	down_write(&shrinker_rwsem);
-	list_del(&shrinker->list);
+	list_del_rcu(&shrinker->list);
 	shrinker->flags &= ~SHRINKER_REGISTERED;
 	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 		unregister_memcg_shrinker(shrinker);
 	debugfs_entry = shrinker_debugfs_remove(shrinker);
 	up_write(&shrinker_rwsem);
 
+	synchronize_srcu(&shrinker_srcu);
+
 	debugfs_remove_recursive(debugfs_entry);
 
 	kfree(shrinker->nr_deferred);
@@ -786,6 +789,7 @@  void synchronize_shrinkers(void)
 {
 	down_write(&shrinker_rwsem);
 	up_write(&shrinker_rwsem);
+	synchronize_srcu(&shrinker_srcu);
 }
 EXPORT_SYMBOL(synchronize_shrinkers);
 
@@ -996,6 +1000,7 @@  static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 {
 	unsigned long ret, freed = 0;
 	struct shrinker *shrinker;
+	int srcu_idx;
 
 	/*
 	 * The root memcg might be allocated even though memcg is disabled
@@ -1007,10 +1012,10 @@  static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
 		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
 
-	if (!down_read_trylock(&shrinker_rwsem))
-		goto out;
+	srcu_idx = srcu_read_lock(&shrinker_srcu);
 
-	list_for_each_entry(shrinker, &shrinker_list, list) {
+	list_for_each_entry_srcu(shrinker, &shrinker_list, list,
+				 srcu_read_lock_held(&shrinker_srcu)) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
@@ -1021,19 +1026,9 @@  static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		if (ret == SHRINK_EMPTY)
 			ret = 0;
 		freed += ret;
-		/*
-		 * Bail out if someone want to register a new shrinker to
-		 * prevent the registration from being stalled for long periods
-		 * by parallel ongoing shrinking.
-		 */
-		if (rwsem_is_contended(&shrinker_rwsem)) {
-			freed = freed ? : 1;
-			break;
-		}
 	}
 
-	up_read(&shrinker_rwsem);
-out:
+	srcu_read_unlock(&shrinker_srcu, srcu_idx);
 	cond_resched();
 	return freed;
 }
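The patch above relies on the usual RCU-style lifetime rule: unregister_shrinker()
unlinks the shrinker with list_del_rcu(), waits in synchronize_srcu() for every
walker that might still see it, and only then tears down debugfs and frees
shrinker->nr_deferred, while the read side merely wraps the list walk in
srcu_read_lock()/srcu_read_unlock(). The rwsem_is_contended() bail-out is gone in
this version; restoring an equivalent signal is what the shrinker_srcu_generation
counter discussed above is for. A minimal user-space sketch of that retract, wait,
then free ordering, again with C11 atomics and a spin-wait in place of SRCU (none
of these names are kernel or liburcu APIs):

	/* Toy model of the unregister side: retract the object, wait for
	 * every walker that might still see it, then free it. */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	static _Atomic(int *) published;	/* models the RCU list entry */
	static atomic_int readers;		/* models "inside the SRCU section" */

	static void *walker(void *arg)
	{
		(void)arg;
		atomic_fetch_add(&readers, 1);		/* "srcu_read_lock()" */
		int *p = atomic_load(&published);	/* "list_for_each_entry_srcu()" */
		if (p)
			printf("walker sees %d\n", *p);	/* safe: free waits for us */
		atomic_fetch_sub(&readers, 1);		/* "srcu_read_unlock()" */
		return NULL;
	}

	int main(void)
	{
		int *obj = malloc(sizeof(*obj));
		*obj = 42;
		atomic_store(&published, obj);		/* "list_add_tail_rcu()" */

		pthread_t t;
		pthread_create(&t, NULL, walker, NULL);

		/* "unregister_shrinker()" */
		atomic_store(&published, NULL);		/* "list_del_rcu()" */
		while (atomic_load(&readers) != 0)	/* "synchronize_srcu()" */
			;				/* toy grace period */
		free(obj);				/* no walker can still hold obj */

		pthread_join(t, NULL);
		return 0;
	}

Built with a C11 compiler and -pthread, the only property the sketch demonstrates
is that free() cannot run while a walker that observed the object is still between
its two readers updates, which is the guarantee synchronize_srcu() gives the
unregister path.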