Message ID: 20210927074823.5825-1-sultan@kerneltoast.com
State: New
Series: mm: vmscan: Replace shrinker_rwsem trylocks with SRCU protection
On Mon 27-09-21 00:48:23, Sultan Alsawaf wrote:
> From: Sultan Alsawaf <sultan@kerneltoast.com>
>
> The shrinker rwsem is problematic because the actual shrinking path must
> back off when contention appears, causing some or all shrinkers to be
> skipped. This can be especially bad when shrinkers are frequently
> registered and unregistered. A high frequency of shrinker
> registrations/unregistrations can effectively DoS the shrinker
> mechanism, rendering it useless.

Can you be more specific about those scenarios please?

> Using SRCU to protect iteration through the shrinker list and idr
> eliminates this issue entirely. Now, shrinking can happen concurrently
> with shrinker registrations/unregistrations.

I have a vague recollection that this approach has been proposed in the
past. Have you checked previous attempts?

> Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
> ---
>  mm/vmscan.c | 68 ++++++++++++++++++++++++++++-------------------------
>  1 file changed, 36 insertions(+), 32 deletions(-)

[...]
On Mon, Sep 27, 2021 at 10:36:36AM +0200, Michal Hocko wrote:
> Can you be more specific about those scenarios please?

I see frequent tmpfs mounts/unmounts in Android which result in lots of
superblocks being created and destroyed, and each superblock has its own
shrinker.

> I have a vague recollection that this approach has been proposed in the
> past. Have you checked previous attempts?

I wasn't aware that there were previous attempts. Is this [1] the previous
attempt you're thinking of? It seems like the thread just died out. Was the
main gripe that always enabling CONFIG_SRCU increased the kernel's size too
much when tinyfication was desired?

Sultan

[1] https://lore.kernel.org/all/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
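To make the reported scenario concrete, here is a minimal reproducer
sketch, assuming that mount/umount churn alone is enough to exercise the
shrinker register/unregister path; the mount point and the endless loop
are illustrative assumptions, not a tested case from this thread. Each
tmpfs mount allocates a superblock whose shrinker is registered at mount
time and unregistered at umount, so a tight loop churns the shrinker list
while reclaim may be trying to walk it:

/*
 * Hypothetical reproducer sketch (not from this thread): register and
 * unregister a superblock shrinker on every iteration by mounting and
 * unmounting tmpfs. Run alongside memory pressure to make shrink_slab()
 * find shrinker_rwsem contended and bail out. Requires root and a
 * pre-created directory at MNT.
 */
#include <stdio.h>
#include <sys/mount.h>

#define MNT "/tmp/shrinker-churn"       /* illustrative path */

int main(void)
{
        for (;;) {
                /* alloc_super() registers the superblock's shrinker */
                if (mount("tmpfs", MNT, "tmpfs", 0, NULL)) {
                        perror("mount");
                        return 1;
                }
                /* tearing down the superblock unregisters it again */
                if (umount(MNT)) {
                        perror("umount");
                        return 1;
                }
        }
}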
On Mon 27-09-21 09:53:59, Sultan Alsawaf wrote:
> On Mon, Sep 27, 2021 at 10:36:36AM +0200, Michal Hocko wrote:
> > Can you be more specific about those scenarios please?
>
> I see frequent tmpfs mounts/unmounts in Android which result in lots of
> superblocks being created and destroyed, and each superblock has its own
> shrinker.

This is an important detail to mention in the changelog, including some
details about the scale of the problem.

> > I have a vague recollection that this approach has been proposed in the
> > past. Have you checked previous attempts?
>
> I wasn't aware that there were previous attempts. Is this [1] the
> previous attempt you're thinking of? It seems like the thread just died
> out. Was the main gripe that always enabling CONFIG_SRCU increased the
> kernel's size too much when tinyfication was desired?

There were more discussions. I have also found
https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
My memory is quite dim and I do not have time to re-read those discussions
again right now. But if I remember correctly, the last one died out because
a deeper evaluation of SRCU versus rw semaphores was required.

> Sultan
>
> [1] https://lore.kernel.org/all/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 74296c2d1fed..3dea927be5ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -190,6 +190,7 @@ static void set_task_reclaim_state(struct task_struct *task,
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
+DEFINE_STATIC_SRCU(shrinker_srcu);
 
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
@@ -212,6 +213,20 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
                                          lockdep_is_held(&shrinker_rwsem));
 }
 
+static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
+                                                int nid)
+{
+        return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
+                                &shrinker_srcu);
+}
+
+static void free_shrinker_info_srcu(struct rcu_head *head)
+{
+        struct shrinker_info *info = container_of(head, typeof(*info), rcu);
+
+        kvfree_rcu(info, rcu);
+}
+
 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
                                     int map_size, int defer_size,
                                     int old_map_size, int old_defer_size)
@@ -244,7 +259,12 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
                        defer_size - old_defer_size);
 
                 rcu_assign_pointer(pn->shrinker_info, new);
-                kvfree_rcu(old, rcu);
+
+                /*
+                 * Shrinker info is used under both SRCU and regular RCU, so
+                 * synchronize the free against both of them.
+                 */
+                call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_srcu);
         }
 
         return 0;
@@ -357,7 +377,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
                 return -ENOSYS;
 
         down_write(&shrinker_rwsem);
-        /* This may call shrinker, so it must use down_read_trylock() */
         id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
         if (id < 0)
                 goto unlock;
@@ -391,7 +410,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
 {
         struct shrinker_info *info;
 
-        info = shrinker_info_protected(memcg, nid);
+        info = shrinker_info_srcu(memcg, nid);
         return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
 }
 
@@ -400,7 +419,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 {
         struct shrinker_info *info;
 
-        info = shrinker_info_protected(memcg, nid);
+        info = shrinker_info_srcu(memcg, nid);
         return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
 }
 
@@ -641,6 +660,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
                 down_write(&shrinker_rwsem);
                 unregister_memcg_shrinker(shrinker);
                 up_write(&shrinker_rwsem);
+                synchronize_srcu(&shrinker_srcu);
                 return;
         }
 
@@ -651,7 +671,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
 void register_shrinker_prepared(struct shrinker *shrinker)
 {
         down_write(&shrinker_rwsem);
-        list_add_tail(&shrinker->list, &shrinker_list);
+        list_add_tail_rcu(&shrinker->list, &shrinker_list);
         shrinker->flags |= SHRINKER_REGISTERED;
         up_write(&shrinker_rwsem);
 }
@@ -676,11 +696,12 @@ void unregister_shrinker(struct shrinker *shrinker)
                 return;
 
         down_write(&shrinker_rwsem);
-        list_del(&shrinker->list);
+        list_del_rcu(&shrinker->list);
         shrinker->flags &= ~SHRINKER_REGISTERED;
         if (shrinker->flags & SHRINKER_MEMCG_AWARE)
                 unregister_memcg_shrinker(shrinker);
         up_write(&shrinker_rwsem);
+        synchronize_srcu(&shrinker_srcu);
 
         kfree(shrinker->nr_deferred);
         shrinker->nr_deferred = NULL;
@@ -792,15 +813,13 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
         struct shrinker_info *info;
         unsigned long ret, freed = 0;
-        int i;
+        int i, srcu_idx;
 
         if (!mem_cgroup_online(memcg))
                 return 0;
 
-        if (!down_read_trylock(&shrinker_rwsem))
-                return 0;
-
-        info = shrinker_info_protected(memcg, nid);
+        srcu_idx = srcu_read_lock(&shrinker_srcu);
+        info = shrinker_info_srcu(memcg, nid);
         if (unlikely(!info))
                 goto unlock;
 
@@ -850,14 +869,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                                 set_shrinker_bit(memcg, nid, i);
                 }
                 freed += ret;
-
-                if (rwsem_is_contended(&shrinker_rwsem)) {
-                        freed = freed ? : 1;
-                        break;
-                }
         }
 unlock:
-        up_read(&shrinker_rwsem);
+        srcu_read_unlock(&shrinker_srcu, srcu_idx);
         return freed;
 }
 #else /* CONFIG_MEMCG */
@@ -894,6 +908,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 {
         unsigned long ret, freed = 0;
         struct shrinker *shrinker;
+        int srcu_idx;
 
         /*
          * The root memcg might be allocated even though memcg is disabled
@@ -905,10 +920,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
         if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
                 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
 
-        if (!down_read_trylock(&shrinker_rwsem))
-                goto out;
-
-        list_for_each_entry(shrinker, &shrinker_list, list) {
+        srcu_idx = srcu_read_lock(&shrinker_srcu);
+        list_for_each_entry_srcu(shrinker, &shrinker_list, list,
+                                 srcu_read_lock_held(&shrinker_srcu)) {
                 struct shrink_control sc = {
                         .gfp_mask = gfp_mask,
                         .nid = nid,
@@ -919,19 +933,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                 if (ret == SHRINK_EMPTY)
                         ret = 0;
                 freed += ret;
-                /*
-                 * Bail out if someone want to register a new shrinker to
-                 * prevent the registration from being stalled for long periods
-                 * by parallel ongoing shrinking.
-                 */
-                if (rwsem_is_contended(&shrinker_rwsem)) {
-                        freed = freed ? : 1;
-                        break;
-                }
         }
+        srcu_read_unlock(&shrinker_srcu, srcu_idx);
 
-        up_read(&shrinker_rwsem);
-out:
         cond_resched();
         return freed;
 }
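For reference, the locking scheme the patch moves to is the classic SRCU
reader/writer split: readers walk the list inside an SRCU read-side
critical section and are never skipped because of contention, while
writers unlink entries under the write-side lock and then wait out
in-flight readers before freeing. Below is a minimal sketch of that
pattern, assuming an illustrative demo_* object and a plain mutex rather
than the actual shrinker structures:

#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/srcu.h>

/* Illustrative stand-in for a shrinker-like object; names are made up. */
struct demo_node {
        struct list_head list;
        long payload;
};

static LIST_HEAD(demo_list);
static DEFINE_MUTEX(demo_lock);         /* serializes writers only */
DEFINE_STATIC_SRCU(demo_srcu);

/* Reader: may sleep, never backs off, never blocks writers. */
static long demo_walk(void)
{
        struct demo_node *node;
        long sum = 0;
        int idx;

        idx = srcu_read_lock(&demo_srcu);
        list_for_each_entry_srcu(node, &demo_list, list,
                                 srcu_read_lock_held(&demo_srcu))
                sum += node->payload;
        srcu_read_unlock(&demo_srcu, idx);
        return sum;
}

/* Writer: unlink, wait for every in-flight reader, then free. */
static void demo_remove(struct demo_node *node)
{
        mutex_lock(&demo_lock);
        list_del_rcu(&node->list);
        mutex_unlock(&demo_lock);

        synchronize_srcu(&demo_srcu);   /* readers can no longer see node */
        kfree(node);
}

The cost of this scheme, and presumably part of what the earlier threads
needed to evaluate, is that every unregistration now pays a full SRCU
grace period in synchronize_srcu() instead of readers occasionally giving
up the walk.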