Message ID | 20240624175313.47329-6-ryncsn@gmail.com (mailing list archive) |
---|---|
State | New |
Series | Split list_lru lock into per-cgroup scope |
> On Jun 25, 2024, at 01:53, Kairui Song <ryncsn@gmail.com> wrote: > > From: Kairui Song <kasong@tencent.com> > > Currently, there is a lot of code for detecting reparent racing > using kmemcg_id as the synchronization flag. And an intermediate > table is required to record and compare the kmemcg_id. > > We can simplify this by just checking the cgroup css status, skip > if cgroup is being offlined. On the reparenting side, ensure no > more allocation is on going and no further allocation will occur > by using the XArray lock as barrier. > > Combined with a O(n^2) top-down walk for the allocation, we get rid > of the intermediate table allocation completely. Despite being O(n^2), > it should be actually faster because it's not practical to have a very > deep cgroup level. I kind of agree with you. But just instinctually. > > This also avoided changing kmemcg_id before reparenting, making > cgroups have a stable index for list_lru_memcg. After this change > it's possible that a dying cgroup will see a NULL value in XArray > corresponding to the kmemcg_id, because the kmemcg_id will point > to an empty slot. In such case, just fallback to use its parent. > > As a result the code is simpler, following test also showed a > performance gain (6 test runs): > > mkdir /tmp/test-fs > modprobe brd rd_nr=1 rd_size=16777216 > mkfs.xfs /dev/ram0 > mount -t xfs /dev/ram0 /tmp/test-fs > worker() { > echo TEST-CONTENT > "/tmp/test-fs/$1" > } > do_test() { > for i in $(seq 1 2048); do > (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs" > worker "$i" & > done; wait > echo 1 > /proc/sys/vm/drop_caches > } > mkdir -p /sys/fs/cgroup/benchmark > echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control > for i in $(seq 1 2048); do > rmdir "/sys/fs/cgroup/benchmark/$i" &>/dev/null > mkdir -p "/sys/fs/cgroup/benchmark/$i" > done > time do_test > > Before: > real 0m5.932s user 0m2.366s sys 0m5.583s > real 0m5.939s user 0m2.347s sys 0m5.597s > real 0m6.149s user 0m2.398s sys 0m5.761s > real 0m5.945s user 0m2.403s sys 0m5.547s > real 0m5.925s user 0m2.293s sys 0m5.651s > real 0m6.017s user 0m2.367s sys 0m5.686s > > After: > real 0m5.712s user 0m2.343s sys 0m5.307s > real 0m5.885s user 0m2.326s sys 0m5.518s > real 0m5.694s user 0m2.347s sys 0m5.264s > real 0m5.865s user 0m2.300s sys 0m5.545s > real 0m5.748s user 0m2.273s sys 0m5.424s > real 0m5.756s user 0m2.318s sys 0m5.398s > > Signed-off-by: Kairui Song <kasong@tencent.com> > --- > mm/list_lru.c | 182 ++++++++++++++++++++++---------------------------- > mm/zswap.c | 7 +- > 2 files changed, 81 insertions(+), 108 deletions(-) > > diff --git a/mm/list_lru.c b/mm/list_lru.c > index 4c619857e916..ac8aec8451dd 100644 > --- a/mm/list_lru.c > +++ b/mm/list_lru.c > @@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) > } > return &lru->node[nid].lru; > } > + > +static inline struct list_lru_one * > +list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) > +{ > + struct list_lru_one *l; > +again: > + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); > + if (likely(l)) > + return l; > + > + memcg = parent_mem_cgroup(memcg); > + WARN_ON(!css_is_dying(&memcg->css)); > + goto again; Just want to confirm this will only loop max twice, right? 
> +} > #else > static void list_lru_register(struct list_lru *lru) > { > @@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) > { > return &lru->node[nid].lru; > } > + > +static inline struct list_lru_one * > +list_lru_from_memcg(struct list_lru *lru, int nid, int idx) > +{ > + return &lru->node[nid].lru; > +} > #endif /* CONFIG_MEMCG_KMEM */ > > bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, > @@ -93,7 +113,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, > > spin_lock(&nlru->lock); > if (list_empty(item)) { > - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); > + l = list_lru_from_memcg(lru, nid, memcg); > list_add_tail(item, &l->list); > /* Set shrinker bit if the first element was added */ > if (!l->nr_items++) > @@ -124,7 +144,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, > > spin_lock(&nlru->lock); > if (!list_empty(item)) { > - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); > + l = list_lru_from_memcg(lru, nid, memcg); > list_del_init(item); > l->nr_items--; > nlru->nr_items--; > @@ -339,20 +359,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) > return mlru; > } > > -static void memcg_list_lru_free(struct list_lru *lru, int src_idx) > -{ > - struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); > - > - /* > - * The __list_lru_walk_one() can walk the list of this node. > - * We need kvfree_rcu() here. And the walking of the list > - * is under lru->node[nid]->lock, which can serve as a RCU > - * read-side critical section. > - */ > - if (mlru) > - kvfree_rcu(mlru, rcu); > -} > - > static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) > { > if (memcg_aware) > @@ -377,22 +383,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru) > } > > static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, > - int src_idx, struct mem_cgroup *dst_memcg) > + struct list_lru_one *src, > + struct mem_cgroup *dst_memcg) > { > struct list_lru_node *nlru = &lru->node[nid]; > - int dst_idx = dst_memcg->kmemcg_id; > - struct list_lru_one *src, *dst; > + struct list_lru_one *dst; > > /* > * Since list_lru_{add,del} may be called under an IRQ-safe lock, > * we have to use IRQ-safe primitives here to avoid deadlock. > */ > spin_lock_irq(&nlru->lock); > - > - src = list_lru_from_memcg_idx(lru, nid, src_idx); > - if (!src) > - goto out; > - dst = list_lru_from_memcg_idx(lru, nid, dst_idx); > + dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); > > list_splice_init(&src->list, &dst->list); > > @@ -401,46 +403,45 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, > set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); > src->nr_items = 0; > } > -out: > spin_unlock_irq(&nlru->lock); > } > > void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) > { > - struct cgroup_subsys_state *css; > struct list_lru *lru; > - int src_idx = memcg->kmemcg_id, i; > - > - /* > - * Change kmemcg_id of this cgroup and all its descendants to the > - * parent's id, and then move all entries from this cgroup's list_lrus > - * to ones of the parent. 
> - */ > - rcu_read_lock(); > - css_for_each_descendant_pre(css, &memcg->css) { > - struct mem_cgroup *child; > - > - child = mem_cgroup_from_css(css); > - WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); > - } > - rcu_read_unlock(); > + int i; > > - /* > - * With kmemcg_id set to parent, holding the lru lock below can > - * prevent list_lru_{add,del,isolate} from touching the lru, safe > - * to reparent. > - */ > mutex_lock(&list_lrus_mutex); > list_for_each_entry(lru, &memcg_list_lrus, list) { > + struct list_lru_memcg *mlru; > + XA_STATE(xas, &lru->xa, memcg->kmemcg_id); > + > + /* > + * Lock the Xarray to ensure no on going allocation and ongoing? I'd like to rewrite the comment here, like: The XArray lock is used to make the future observers will see the current memory cgroup has been dying (i.e. css_is_dying() will return true), which is significant for memcg_list_lru_alloc() to make sure it will not allocate a new mlru for this died memory cgroup for which we just free it below. > + * further allocation will see css_is_dying(). > + */ > + xas_lock_irq(&xas); > + mlru = xas_load(&xas); > + if (mlru) > + xas_store(&xas, NULL); > + xas_unlock_irq(&xas); The above block could be replaced with xa_erase_irq(), simpler, right? > + if (!mlru) > + continue; > + > + /* > + * With Xarray value set to NULL, holding the lru lock below > + * prevents list_lru_{add,del,isolate} from touching the lru, > + * safe to reparent. > + */ > for_each_node(i) > - memcg_reparent_list_lru_node(lru, i, src_idx, parent); > + memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); > > /* > * Here all list_lrus corresponding to the cgroup are guaranteed > * to remain empty, we can safely free this lru, any further > * memcg_list_lru_alloc() call will simply bail out. > */ > - memcg_list_lru_free(lru, src_idx); > + kvfree_rcu(mlru, rcu); > } > mutex_unlock(&list_lrus_mutex); > } > @@ -456,78 +457,51 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, > int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, > gfp_t gfp) > { > - int i; > unsigned long flags; > - struct list_lru_memcg_table { > - struct list_lru_memcg *mlru; > - struct mem_cgroup *memcg; > - } *table; > + struct list_lru_memcg *mlru; > + struct mem_cgroup *pos, *parent; > XA_STATE(xas, &lru->xa, 0); > > if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) > return 0; > > gfp &= GFP_RECLAIM_MASK; > - table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); > - if (!table) > - return -ENOMEM; > - > /* > * Because the list_lru can be reparented to the parent cgroup's > * list_lru, we should make sure that this cgroup and all its > * ancestors have allocated list_lru_memcg. > */ > - for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { > - if (memcg_list_lru_allocated(memcg, lru)) > - break; > - > - table[i].memcg = memcg; > - table[i].mlru = memcg_init_list_lru_one(gfp); > - if (!table[i].mlru) { > - while (i--) > - kfree(table[i].mlru); > - kfree(table); > - return -ENOMEM; > + do { > + /* > + * Keep finding the farest parent that wasn't populated > + * until found memcg itself. > + */ > + pos = memcg; > + parent = parent_mem_cgroup(pos); > + while (parent && !memcg_list_lru_allocated(parent, lru)) { The check of @parent is unnecessary since memcg_list_lru_allocated() always return true for root memory cgroup. Right? 
> + pos = parent; > + parent = parent_mem_cgroup(pos); > } > - } > > - xas_lock_irqsave(&xas, flags); > - while (i--) { > - int index = READ_ONCE(table[i].memcg->kmemcg_id); > - struct list_lru_memcg *mlru = table[i].mlru; > - > - xas_set(&xas, index); > -retry: > - if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { > - kfree(mlru); > - } else { > - xas_store(&xas, mlru); > - if (xas_error(&xas) == -ENOMEM) { > + mlru = memcg_init_list_lru_one(gfp); > + do { > + bool alloced = false; > + > + xas_set(&xas, pos->kmemcg_id); > + xas_lock_irqsave(&xas, flags); > + if (!css_is_dying(&pos->css) && !xas_load(&xas)) { > + xas_store(&xas, mlru); > + alloced = true; > + } > + if (!alloced || xas_error(&xas)) { > xas_unlock_irqrestore(&xas, flags); > - if (xas_nomem(&xas, gfp)) > - xas_set_err(&xas, 0); > - xas_lock_irqsave(&xas, flags); > - /* > - * The xas lock has been released, this memcg > - * can be reparented before us. So reload > - * memcg id. More details see the comments > - * in memcg_reparent_list_lrus(). > - */ > - index = READ_ONCE(table[i].memcg->kmemcg_id); > - if (index < 0) > - xas_set_err(&xas, 0); > - else if (!xas_error(&xas) && index != xas.xa_index) > - xas_set(&xas, index); > - goto retry; > + kfree(mlru); > + goto out; 1) If xas_error() returns true, we should continue since xas_mem() will handle the NOMEM case for us. 2) If xas_load() returns a non-NULL pointer meaning there is a concurrent thread has assigned a new mlru for us, we should continue since we might have not assigned a mlru for the leaf memory cgroup. Suppose the following case: A / \ B C If there are two threads allocating mlru for A, then either B or C will not allocate a mlru. Here is a code snippet FYI. Thanks. ```c struct list_lru_memcg *mlru = NULL; do { /* ... */ if (!mlru) mlru = memcg_init_list_lru_one(gfp); xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); if (css_is_dying(&pos->css) || xas_load(&xas)) goto unlock; xas_store(&xas, mlru); if (!xas_error(&xas)) mlru = NULL; unlock: xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); } while (pos != memcg); if (mlru) kfree(mlru); /* ... */ ``` > } > - } > - } > - /* xas_nomem() is used to free memory instead of memory allocation. */ > - if (xas.xa_alloc) > - xas_nomem(&xas, gfp); > - xas_unlock_irqrestore(&xas, flags); > - kfree(table); > - > + xas_unlock_irqrestore(&xas, flags); > + } while (xas_nomem(&xas, gfp)); > + } while (pos != memcg); > +out: > return xas_error(&xas); > } > #else > diff --git a/mm/zswap.c b/mm/zswap.c > index a50e2986cd2f..c6e2256347ff 100644 > --- a/mm/zswap.c > +++ b/mm/zswap.c > @@ -718,12 +718,11 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) > > /* > * Note that it is safe to use rcu_read_lock() here, even in the face of > - * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection > - * used in list_lru lookup, only two scenarios are possible: > + * concurrent memcg offlining: > * > - * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The > + * 1. list_lru_add() is called before list_lru_memcg is erased. The > * new entry will be reparented to memcg's parent's list_lru. > - * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The > + * 2. list_lru_add() is called after list_lru_memcg is erased. The > * new entry will be added directly to memcg's parent's list_lru. > * > * Similar reasoning holds for list_lru_del(). > -- > 2.45.2 >
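A side note on the O(n^2) remark above: the cost of the top-down walk is bounded by the cgroup depth, not by the number of cgroups. The toy userspace model below is purely illustrative (hypothetical names, not kernel code); it mimics the climb over a completely cold chain of depth 8 and counts the ancestor checks, which come out to d*(d+1)/2 in that worst case.

```c
/*
 * Toy model of the top-down walk in memcg_list_lru_alloc(): index 0
 * plays the root (always populated), DEPTH plays the leaf memcg, and
 * allocated[i] stands in for memcg_list_lru_allocated().
 */
#include <stdbool.h>
#include <stdio.h>

#define DEPTH 8

int main(void)
{
	bool allocated[DEPTH + 1] = { [0] = true };	/* root is populated */
	int checks = 0;

	while (!allocated[DEPTH]) {		/* one "mlru allocation" per pass */
		int pos = DEPTH;

		/* Climb to the farthest ancestor that is not populated yet. */
		while (pos > 0 && !allocated[pos - 1]) {
			checks++;
			pos--;
		}
		checks++;			/* the check that ended the climb */
		allocated[pos] = true;		/* "xas_store(&xas, mlru)" for pos */
	}
	printf("depth=%d ancestor checks=%d (d*(d+1)/2 = %d)\n",
	       DEPTH, checks, DEPTH * (DEPTH + 1) / 2);
	return 0;
}
```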
On Wed, Jul 17, 2024 at 11:05 AM Muchun Song <muchun.song@linux.dev> wrote: > > On Jun 25, 2024, at 01:53, Kairui Song <ryncsn@gmail.com> wrote: > > > > From: Kairui Song <kasong@tencent.com> > > > > Currently, there is a lot of code for detecting reparent racing > > using kmemcg_id as the synchronization flag. And an intermediate > > table is required to record and compare the kmemcg_id. > > > > We can simplify this by just checking the cgroup css status, skip > > if cgroup is being offlined. On the reparenting side, ensure no > > more allocation is on going and no further allocation will occur > > by using the XArray lock as barrier. > > > > Combined with a O(n^2) top-down walk for the allocation, we get rid > > of the intermediate table allocation completely. Despite being O(n^2), > > it should be actually faster because it's not practical to have a very > > deep cgroup level. > > I kind of agree with you. But just instinctually. > > > > > This also avoided changing kmemcg_id before reparenting, making > > cgroups have a stable index for list_lru_memcg. After this change > > it's possible that a dying cgroup will see a NULL value in XArray > > corresponding to the kmemcg_id, because the kmemcg_id will point > > to an empty slot. In such case, just fallback to use its parent. > > > > As a result the code is simpler, following test also showed a > > performance gain (6 test runs): > > > > mkdir /tmp/test-fs > > modprobe brd rd_nr=1 rd_size=16777216 > > mkfs.xfs /dev/ram0 > > mount -t xfs /dev/ram0 /tmp/test-fs > > worker() { > > echo TEST-CONTENT > "/tmp/test-fs/$1" > > } > > do_test() { > > for i in $(seq 1 2048); do > > (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs" > > worker "$i" & > > done; wait > > echo 1 > /proc/sys/vm/drop_caches > > } > > mkdir -p /sys/fs/cgroup/benchmark > > echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control > > for i in $(seq 1 2048); do > > rmdir "/sys/fs/cgroup/benchmark/$i" &>/dev/null > > mkdir -p "/sys/fs/cgroup/benchmark/$i" > > done > > time do_test > > > > Before: > > real 0m5.932s user 0m2.366s sys 0m5.583s > > real 0m5.939s user 0m2.347s sys 0m5.597s > > real 0m6.149s user 0m2.398s sys 0m5.761s > > real 0m5.945s user 0m2.403s sys 0m5.547s > > real 0m5.925s user 0m2.293s sys 0m5.651s > > real 0m6.017s user 0m2.367s sys 0m5.686s > > > > After: > > real 0m5.712s user 0m2.343s sys 0m5.307s > > real 0m5.885s user 0m2.326s sys 0m5.518s > > real 0m5.694s user 0m2.347s sys 0m5.264s > > real 0m5.865s user 0m2.300s sys 0m5.545s > > real 0m5.748s user 0m2.273s sys 0m5.424s > > real 0m5.756s user 0m2.318s sys 0m5.398s > > > > Signed-off-by: Kairui Song <kasong@tencent.com> > > --- > > mm/list_lru.c | 182 ++++++++++++++++++++++---------------------------- > > mm/zswap.c | 7 +- > > 2 files changed, 81 insertions(+), 108 deletions(-) > > > > diff --git a/mm/list_lru.c b/mm/list_lru.c > > index 4c619857e916..ac8aec8451dd 100644 > > --- a/mm/list_lru.c > > +++ b/mm/list_lru.c > > @@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) > > } > > return &lru->node[nid].lru; > > } > > + > > +static inline struct list_lru_one * > > +list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) > > +{ > > + struct list_lru_one *l; > > +again: > > + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); > > + if (likely(l)) > > + return l; > > + > > + memcg = parent_mem_cgroup(memcg); > > + WARN_ON(!css_is_dying(&memcg->css)); > > + goto again; > > Just want to confirm this will only loop 
max twice, right? If the memcg is dying, it finds the nearest parent that is still active and returns the parent's lru. If the parent is also dying, try the parent's parent. It may repeat mult times until it reaches root cg but very unlikely. > > > +} > > #else > > static void list_lru_register(struct list_lru *lru) > > { > > @@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) > > { > > return &lru->node[nid].lru; > > } > > + > > +static inline struct list_lru_one * > > +list_lru_from_memcg(struct list_lru *lru, int nid, int idx) > > +{ > > + return &lru->node[nid].lru; > > +} > > #endif /* CONFIG_MEMCG_KMEM */ > > > > bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, > > @@ -93,7 +113,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, > > > > spin_lock(&nlru->lock); > > if (list_empty(item)) { > > - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); > > + l = list_lru_from_memcg(lru, nid, memcg); > > list_add_tail(item, &l->list); > > /* Set shrinker bit if the first element was added */ > > if (!l->nr_items++) > > @@ -124,7 +144,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, > > > > spin_lock(&nlru->lock); > > if (!list_empty(item)) { > > - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); > > + l = list_lru_from_memcg(lru, nid, memcg); > > list_del_init(item); > > l->nr_items--; > > nlru->nr_items--; > > @@ -339,20 +359,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) > > return mlru; > > } > > > > -static void memcg_list_lru_free(struct list_lru *lru, int src_idx) > > -{ > > - struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); > > - > > - /* > > - * The __list_lru_walk_one() can walk the list of this node. > > - * We need kvfree_rcu() here. And the walking of the list > > - * is under lru->node[nid]->lock, which can serve as a RCU > > - * read-side critical section. > > - */ > > - if (mlru) > > - kvfree_rcu(mlru, rcu); > > -} > > - > > static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) > > { > > if (memcg_aware) > > @@ -377,22 +383,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru) > > } > > > > static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, > > - int src_idx, struct mem_cgroup *dst_memcg) > > + struct list_lru_one *src, > > + struct mem_cgroup *dst_memcg) > > { > > struct list_lru_node *nlru = &lru->node[nid]; > > - int dst_idx = dst_memcg->kmemcg_id; > > - struct list_lru_one *src, *dst; > > + struct list_lru_one *dst; > > > > /* > > * Since list_lru_{add,del} may be called under an IRQ-safe lock, > > * we have to use IRQ-safe primitives here to avoid deadlock. 
> > */ > > spin_lock_irq(&nlru->lock); > > - > > - src = list_lru_from_memcg_idx(lru, nid, src_idx); > > - if (!src) > > - goto out; > > - dst = list_lru_from_memcg_idx(lru, nid, dst_idx); > > + dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); > > > > list_splice_init(&src->list, &dst->list); > > > > @@ -401,46 +403,45 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, > > set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); > > src->nr_items = 0; > > } > > -out: > > spin_unlock_irq(&nlru->lock); > > } > > > > void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) > > { > > - struct cgroup_subsys_state *css; > > struct list_lru *lru; > > - int src_idx = memcg->kmemcg_id, i; > > - > > - /* > > - * Change kmemcg_id of this cgroup and all its descendants to the > > - * parent's id, and then move all entries from this cgroup's list_lrus > > - * to ones of the parent. > > - */ > > - rcu_read_lock(); > > - css_for_each_descendant_pre(css, &memcg->css) { > > - struct mem_cgroup *child; > > - > > - child = mem_cgroup_from_css(css); > > - WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); > > - } > > - rcu_read_unlock(); > > + int i; > > > > - /* > > - * With kmemcg_id set to parent, holding the lru lock below can > > - * prevent list_lru_{add,del,isolate} from touching the lru, safe > > - * to reparent. > > - */ > > mutex_lock(&list_lrus_mutex); > > list_for_each_entry(lru, &memcg_list_lrus, list) { > > + struct list_lru_memcg *mlru; > > + XA_STATE(xas, &lru->xa, memcg->kmemcg_id); > > + > > + /* > > + * Lock the Xarray to ensure no on going allocation and > > ongoing? I'd like to rewrite the comment here, like: > > The XArray lock is used to make the future observers will see the current > memory cgroup has been dying (i.e. css_is_dying() will return true), which > is significant for memcg_list_lru_alloc() to make sure it will not allocate > a new mlru for this died memory cgroup for which we just free it below. Good suggestion. > > > + * further allocation will see css_is_dying(). > > + */ > > + xas_lock_irq(&xas); > > + mlru = xas_load(&xas); > > + if (mlru) > > + xas_store(&xas, NULL); > > + xas_unlock_irq(&xas); > > The above block could be replaced with xa_erase_irq(), simpler, right? I wanted to reuse the `xas` here, it will be used by later commits and this helps reduce total stack usage. Might be trivial, I can use xa_erase_irq here for easier coding, and expand this to this again later. > > > + if (!mlru) > > + continue; > > + > > + /* > > + * With Xarray value set to NULL, holding the lru lock below > > + * prevents list_lru_{add,del,isolate} from touching the lru, > > + * safe to reparent. > > + */ > > for_each_node(i) > > - memcg_reparent_list_lru_node(lru, i, src_idx, parent); > > + memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); > > > > /* > > * Here all list_lrus corresponding to the cgroup are guaranteed > > * to remain empty, we can safely free this lru, any further > > * memcg_list_lru_alloc() call will simply bail out. 
> > */ > > - memcg_list_lru_free(lru, src_idx); > > + kvfree_rcu(mlru, rcu); > > } > > mutex_unlock(&list_lrus_mutex); > > } > > @@ -456,78 +457,51 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, > > int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, > > gfp_t gfp) > > { > > - int i; > > unsigned long flags; > > - struct list_lru_memcg_table { > > - struct list_lru_memcg *mlru; > > - struct mem_cgroup *memcg; > > - } *table; > > + struct list_lru_memcg *mlru; > > + struct mem_cgroup *pos, *parent; > > XA_STATE(xas, &lru->xa, 0); > > > > if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) > > return 0; > > > > gfp &= GFP_RECLAIM_MASK; > > - table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); > > - if (!table) > > - return -ENOMEM; > > - > > /* > > * Because the list_lru can be reparented to the parent cgroup's > > * list_lru, we should make sure that this cgroup and all its > > * ancestors have allocated list_lru_memcg. > > */ > > - for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { > > - if (memcg_list_lru_allocated(memcg, lru)) > > - break; > > - > > - table[i].memcg = memcg; > > - table[i].mlru = memcg_init_list_lru_one(gfp); > > - if (!table[i].mlru) { > > - while (i--) > > - kfree(table[i].mlru); > > - kfree(table); > > - return -ENOMEM; > > + do { > > + /* > > + * Keep finding the farest parent that wasn't populated > > + * until found memcg itself. > > + */ > > + pos = memcg; > > + parent = parent_mem_cgroup(pos); > > + while (parent && !memcg_list_lru_allocated(parent, lru)) { > > The check of @parent is unnecessary since memcg_list_lru_allocated() always > return true for root memory cgroup. Right? Right, just wrote this to be less error prone as memcg_list_lru_allocated can't handle NULL. But yeah, parent = NULL shouldn't happen here, it can be removed. > > > + pos = parent; > > + parent = parent_mem_cgroup(pos); > > } > > - } > > > > - xas_lock_irqsave(&xas, flags); > > - while (i--) { > > - int index = READ_ONCE(table[i].memcg->kmemcg_id); > > - struct list_lru_memcg *mlru = table[i].mlru; > > - > > - xas_set(&xas, index); > > -retry: > > - if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { > > - kfree(mlru); > > - } else { > > - xas_store(&xas, mlru); > > - if (xas_error(&xas) == -ENOMEM) { > > + mlru = memcg_init_list_lru_one(gfp); > > + do { > > + bool alloced = false; > > + > > + xas_set(&xas, pos->kmemcg_id); > > + xas_lock_irqsave(&xas, flags); > > + if (!css_is_dying(&pos->css) && !xas_load(&xas)) { > > + xas_store(&xas, mlru); > > + alloced = true; > > + } > > + if (!alloced || xas_error(&xas)) { > > xas_unlock_irqrestore(&xas, flags); > > - if (xas_nomem(&xas, gfp)) > > - xas_set_err(&xas, 0); > > - xas_lock_irqsave(&xas, flags); > > - /* > > - * The xas lock has been released, this memcg > > - * can be reparented before us. So reload > > - * memcg id. More details see the comments > > - * in memcg_reparent_list_lrus(). > > - */ > > - index = READ_ONCE(table[i].memcg->kmemcg_id); > > - if (index < 0) > > - xas_set_err(&xas, 0); > > - else if (!xas_error(&xas) && index != xas.xa_index) > > - xas_set(&xas, index); > > - goto retry; > > + kfree(mlru); > > + goto out; > > 1) If xas_error() returns true, we should continue since xas_mem() will handle the NOMEM > case for us. Oh, My bad, I think I lost an intermediate commit for this part and ended up exiting too early on error case; and it also needs to check !mlru case above and return -ENOMEM, will update this part. 
> > 2) If xas_load() returns a non-NULL pointer meaning there is a concurrent thread has > assigned a new mlru for us, we should continue since we might have not assigned a mlru > for the leaf memory cgroup. Suppose the following case: > > A > / \ > B C > > If there are two threads allocating mlru for A, then either B or C will not allocate a mlru. > Here is a code snippet FYI. Nice find, forgot the case when multiple children could race for one parent. > > Thanks. > > ```c > struct list_lru_memcg *mlru = NULL; > > do { > /* ... */ > if (!mlru) > mlru = memcg_init_list_lru_one(gfp); > > xas_set(&xas, pos->kmemcg_id); > do { > xas_lock_irqsave(&xas, flags); > if (css_is_dying(&pos->css) || xas_load(&xas)) > goto unlock; > xas_store(&xas, mlru); > if (!xas_error(&xas)) > mlru = NULL; > unlock: > xas_unlock_irqrestore(&xas, flags); > } while (xas_nomem(&xas, gfp)); > } while (pos != memcg); > > if (mlru) > kfree(mlru); Good suggestion, one more thing is, should it abort the iteration if the memcg is dead? Considering handling the memcg_init_list_lru_one failure, so the loop could be this? + mlru = NULL; + do { + pos = memcg; + parent = parent_mem_cgroup(pos); + while (!memcg_list_lru_allocated(parent, lru)) { + pos = parent; + parent = parent_mem_cgroup(pos); + } + + mlru = mlru ?: memcg_init_list_lru_one(gfp); + if (!mlru) + return -ENOMEM; + xas_set(&xas, pos->kmemcg_id); + do { + xas_lock_irqsave(&xas, flags); + if (!xas_load(&xas) && !css_is_dying(&pos->css)) { + xas_store(&xas, mlru); + if (!xas_error(&xas)) + mlru = NULL; + } + xas_unlock_irqrestore(&xas, flags); + } while (xas_nomem(&xas, gfp)); + } while (!css_is_dying(&pos->css) && pos != memcg); + + if (mlru) + kfree(mlru);
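For reference, the xa_erase_irq() form that Kairui agrees to use for now would collapse the locked load/store pair in memcg_reparent_list_lrus() into something like the fragment below. This is only a sketch built from the hunks in the patch, not a tested change, and the xas-based form may come back later in the series when the xa_state is reused.

```c
	list_for_each_entry(lru, &memcg_list_lrus, list) {
		struct list_lru_memcg *mlru;

		/*
		 * xa_erase_irq() clears the dying memcg's slot under the
		 * XArray lock and returns the old mlru, replacing the
		 * open-coded xas_load()/xas_store(NULL) pair.  Per Muchun's
		 * suggested wording above, the lock makes sure later
		 * memcg_list_lru_alloc() callers observe css_is_dying()
		 * and bail out instead of repopulating the slot.
		 */
		mlru = xa_erase_irq(&lru->xa, memcg->kmemcg_id);
		if (!mlru)
			continue;

		for_each_node(i)
			memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent);
		kvfree_rcu(mlru, rcu);
	}
```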
> On Jul 18, 2024, at 19:49, Kairui Song <ryncsn@gmail.com> wrote: > > On Wed, Jul 17, 2024 at 11:05 AM Muchun Song <muchun.song@linux.dev> wrote: >>> On Jun 25, 2024, at 01:53, Kairui Song <ryncsn@gmail.com> wrote: >>> >>> From: Kairui Song <kasong@tencent.com> >>> >>> Currently, there is a lot of code for detecting reparent racing >>> using kmemcg_id as the synchronization flag. And an intermediate >>> table is required to record and compare the kmemcg_id. >>> >>> We can simplify this by just checking the cgroup css status, skip >>> if cgroup is being offlined. On the reparenting side, ensure no >>> more allocation is on going and no further allocation will occur >>> by using the XArray lock as barrier. >>> >>> Combined with a O(n^2) top-down walk for the allocation, we get rid >>> of the intermediate table allocation completely. Despite being O(n^2), >>> it should be actually faster because it's not practical to have a very >>> deep cgroup level. >> >> I kind of agree with you. But just instinctually. >> >>> >>> This also avoided changing kmemcg_id before reparenting, making >>> cgroups have a stable index for list_lru_memcg. After this change >>> it's possible that a dying cgroup will see a NULL value in XArray >>> corresponding to the kmemcg_id, because the kmemcg_id will point >>> to an empty slot. In such case, just fallback to use its parent. >>> >>> As a result the code is simpler, following test also showed a >>> performance gain (6 test runs): >>> >>> mkdir /tmp/test-fs >>> modprobe brd rd_nr=1 rd_size=16777216 >>> mkfs.xfs /dev/ram0 >>> mount -t xfs /dev/ram0 /tmp/test-fs >>> worker() { >>> echo TEST-CONTENT > "/tmp/test-fs/$1" >>> } >>> do_test() { >>> for i in $(seq 1 2048); do >>> (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs" >>> worker "$i" & >>> done; wait >>> echo 1 > /proc/sys/vm/drop_caches >>> } >>> mkdir -p /sys/fs/cgroup/benchmark >>> echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control >>> for i in $(seq 1 2048); do >>> rmdir "/sys/fs/cgroup/benchmark/$i" &>/dev/null >>> mkdir -p "/sys/fs/cgroup/benchmark/$i" >>> done >>> time do_test >>> >>> Before: >>> real 0m5.932s user 0m2.366s sys 0m5.583s >>> real 0m5.939s user 0m2.347s sys 0m5.597s >>> real 0m6.149s user 0m2.398s sys 0m5.761s >>> real 0m5.945s user 0m2.403s sys 0m5.547s >>> real 0m5.925s user 0m2.293s sys 0m5.651s >>> real 0m6.017s user 0m2.367s sys 0m5.686s >>> >>> After: >>> real 0m5.712s user 0m2.343s sys 0m5.307s >>> real 0m5.885s user 0m2.326s sys 0m5.518s >>> real 0m5.694s user 0m2.347s sys 0m5.264s >>> real 0m5.865s user 0m2.300s sys 0m5.545s >>> real 0m5.748s user 0m2.273s sys 0m5.424s >>> real 0m5.756s user 0m2.318s sys 0m5.398s >>> >>> Signed-off-by: Kairui Song <kasong@tencent.com> >>> --- >>> mm/list_lru.c | 182 ++++++++++++++++++++++---------------------------- >>> mm/zswap.c | 7 +- >>> 2 files changed, 81 insertions(+), 108 deletions(-) >>> >>> diff --git a/mm/list_lru.c b/mm/list_lru.c >>> index 4c619857e916..ac8aec8451dd 100644 >>> --- a/mm/list_lru.c >>> +++ b/mm/list_lru.c >>> @@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) >>> } >>> return &lru->node[nid].lru; >>> } >>> + >>> +static inline struct list_lru_one * >>> +list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) >>> +{ >>> + struct list_lru_one *l; >>> +again: >>> + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); >>> + if (likely(l)) >>> + return l; >>> + >>> + memcg = parent_mem_cgroup(memcg); >>> + 
WARN_ON(!css_is_dying(&memcg->css)); >>> + goto again; >> >> Just want to confirm this will only loop max twice, right? > > If the memcg is dying, it finds the nearest parent that is still > active and returns the parent's lru. If the parent is also dying, try > the parent's parent. It may repeat mult times until it reaches root cg > but very unlikely. Right. > >> >>> +} >>> #else >>> static void list_lru_register(struct list_lru *lru) >>> { >>> @@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) >>> { >>> return &lru->node[nid].lru; >>> } >>> + >>> +static inline struct list_lru_one * >>> +list_lru_from_memcg(struct list_lru *lru, int nid, int idx) >>> +{ >>> + return &lru->node[nid].lru; >>> +} >>> #endif /* CONFIG_MEMCG_KMEM */ >>> >>> bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, >>> @@ -93,7 +113,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, >>> >>> spin_lock(&nlru->lock); >>> if (list_empty(item)) { >>> - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); >>> + l = list_lru_from_memcg(lru, nid, memcg); >>> list_add_tail(item, &l->list); >>> /* Set shrinker bit if the first element was added */ >>> if (!l->nr_items++) >>> @@ -124,7 +144,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, >>> >>> spin_lock(&nlru->lock); >>> if (!list_empty(item)) { >>> - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); >>> + l = list_lru_from_memcg(lru, nid, memcg); >>> list_del_init(item); >>> l->nr_items--; >>> nlru->nr_items--; >>> @@ -339,20 +359,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) >>> return mlru; >>> } >>> >>> -static void memcg_list_lru_free(struct list_lru *lru, int src_idx) >>> -{ >>> - struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); >>> - >>> - /* >>> - * The __list_lru_walk_one() can walk the list of this node. >>> - * We need kvfree_rcu() here. And the walking of the list >>> - * is under lru->node[nid]->lock, which can serve as a RCU >>> - * read-side critical section. >>> - */ >>> - if (mlru) >>> - kvfree_rcu(mlru, rcu); >>> -} >>> - >>> static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) >>> { >>> if (memcg_aware) >>> @@ -377,22 +383,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru) >>> } >>> >>> static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, >>> - int src_idx, struct mem_cgroup *dst_memcg) >>> + struct list_lru_one *src, >>> + struct mem_cgroup *dst_memcg) >>> { >>> struct list_lru_node *nlru = &lru->node[nid]; >>> - int dst_idx = dst_memcg->kmemcg_id; >>> - struct list_lru_one *src, *dst; >>> + struct list_lru_one *dst; >>> >>> /* >>> * Since list_lru_{add,del} may be called under an IRQ-safe lock, >>> * we have to use IRQ-safe primitives here to avoid deadlock. 
>>> */ >>> spin_lock_irq(&nlru->lock); >>> - >>> - src = list_lru_from_memcg_idx(lru, nid, src_idx); >>> - if (!src) >>> - goto out; >>> - dst = list_lru_from_memcg_idx(lru, nid, dst_idx); >>> + dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); >>> >>> list_splice_init(&src->list, &dst->list); >>> >>> @@ -401,46 +403,45 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, >>> set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); >>> src->nr_items = 0; >>> } >>> -out: >>> spin_unlock_irq(&nlru->lock); >>> } >>> >>> void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) >>> { >>> - struct cgroup_subsys_state *css; >>> struct list_lru *lru; >>> - int src_idx = memcg->kmemcg_id, i; >>> - >>> - /* >>> - * Change kmemcg_id of this cgroup and all its descendants to the >>> - * parent's id, and then move all entries from this cgroup's list_lrus >>> - * to ones of the parent. >>> - */ >>> - rcu_read_lock(); >>> - css_for_each_descendant_pre(css, &memcg->css) { >>> - struct mem_cgroup *child; >>> - >>> - child = mem_cgroup_from_css(css); >>> - WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); >>> - } >>> - rcu_read_unlock(); >>> + int i; >>> >>> - /* >>> - * With kmemcg_id set to parent, holding the lru lock below can >>> - * prevent list_lru_{add,del,isolate} from touching the lru, safe >>> - * to reparent. >>> - */ >>> mutex_lock(&list_lrus_mutex); >>> list_for_each_entry(lru, &memcg_list_lrus, list) { >>> + struct list_lru_memcg *mlru; >>> + XA_STATE(xas, &lru->xa, memcg->kmemcg_id); >>> + >>> + /* >>> + * Lock the Xarray to ensure no on going allocation and >> >> ongoing? I'd like to rewrite the comment here, like: >> >> The XArray lock is used to make the future observers will see the current >> memory cgroup has been dying (i.e. css_is_dying() will return true), which >> is significant for memcg_list_lru_alloc() to make sure it will not allocate >> a new mlru for this died memory cgroup for which we just free it below. > > Good suggestion. > >> >>> + * further allocation will see css_is_dying(). >>> + */ >>> + xas_lock_irq(&xas); >>> + mlru = xas_load(&xas); >>> + if (mlru) >>> + xas_store(&xas, NULL); >>> + xas_unlock_irq(&xas); >> >> The above block could be replaced with xa_erase_irq(), simpler, right? > > I wanted to reuse the `xas` here, it will be used by later commits and > this helps reduce total stack usage. Might be trivial, I can use > xa_erase_irq here for easier coding, and expand this to this again > later. Yes. > >> >>> + if (!mlru) >>> + continue; >>> + >>> + /* >>> + * With Xarray value set to NULL, holding the lru lock below >>> + * prevents list_lru_{add,del,isolate} from touching the lru, >>> + * safe to reparent. >>> + */ >>> for_each_node(i) >>> - memcg_reparent_list_lru_node(lru, i, src_idx, parent); >>> + memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); >>> >>> /* >>> * Here all list_lrus corresponding to the cgroup are guaranteed >>> * to remain empty, we can safely free this lru, any further >>> * memcg_list_lru_alloc() call will simply bail out. 
>>> */ >>> - memcg_list_lru_free(lru, src_idx); >>> + kvfree_rcu(mlru, rcu); >>> } >>> mutex_unlock(&list_lrus_mutex); >>> } >>> @@ -456,78 +457,51 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, >>> int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, >>> gfp_t gfp) >>> { >>> - int i; >>> unsigned long flags; >>> - struct list_lru_memcg_table { >>> - struct list_lru_memcg *mlru; >>> - struct mem_cgroup *memcg; >>> - } *table; >>> + struct list_lru_memcg *mlru; >>> + struct mem_cgroup *pos, *parent; >>> XA_STATE(xas, &lru->xa, 0); >>> >>> if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) >>> return 0; >>> >>> gfp &= GFP_RECLAIM_MASK; >>> - table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); >>> - if (!table) >>> - return -ENOMEM; >>> - >>> /* >>> * Because the list_lru can be reparented to the parent cgroup's >>> * list_lru, we should make sure that this cgroup and all its >>> * ancestors have allocated list_lru_memcg. >>> */ >>> - for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { >>> - if (memcg_list_lru_allocated(memcg, lru)) >>> - break; >>> - >>> - table[i].memcg = memcg; >>> - table[i].mlru = memcg_init_list_lru_one(gfp); >>> - if (!table[i].mlru) { >>> - while (i--) >>> - kfree(table[i].mlru); >>> - kfree(table); >>> - return -ENOMEM; >>> + do { >>> + /* >>> + * Keep finding the farest parent that wasn't populated >>> + * until found memcg itself. >>> + */ >>> + pos = memcg; >>> + parent = parent_mem_cgroup(pos); >>> + while (parent && !memcg_list_lru_allocated(parent, lru)) { >> >> The check of @parent is unnecessary since memcg_list_lru_allocated() always >> return true for root memory cgroup. Right? > > Right, just wrote this to be less error prone as > memcg_list_lru_allocated can't handle NULL. But yeah, parent = NULL > shouldn't happen here, it can be removed. > >> >>> + pos = parent; >>> + parent = parent_mem_cgroup(pos); >>> } >>> - } >>> >>> - xas_lock_irqsave(&xas, flags); >>> - while (i--) { >>> - int index = READ_ONCE(table[i].memcg->kmemcg_id); >>> - struct list_lru_memcg *mlru = table[i].mlru; >>> - >>> - xas_set(&xas, index); >>> -retry: >>> - if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { >>> - kfree(mlru); >>> - } else { >>> - xas_store(&xas, mlru); >>> - if (xas_error(&xas) == -ENOMEM) { >>> + mlru = memcg_init_list_lru_one(gfp); >>> + do { >>> + bool alloced = false; >>> + >>> + xas_set(&xas, pos->kmemcg_id); >>> + xas_lock_irqsave(&xas, flags); >>> + if (!css_is_dying(&pos->css) && !xas_load(&xas)) { >>> + xas_store(&xas, mlru); >>> + alloced = true; >>> + } >>> + if (!alloced || xas_error(&xas)) { >>> xas_unlock_irqrestore(&xas, flags); >>> - if (xas_nomem(&xas, gfp)) >>> - xas_set_err(&xas, 0); >>> - xas_lock_irqsave(&xas, flags); >>> - /* >>> - * The xas lock has been released, this memcg >>> - * can be reparented before us. So reload >>> - * memcg id. More details see the comments >>> - * in memcg_reparent_list_lrus(). >>> - */ >>> - index = READ_ONCE(table[i].memcg->kmemcg_id); >>> - if (index < 0) >>> - xas_set_err(&xas, 0); >>> - else if (!xas_error(&xas) && index != xas.xa_index) >>> - xas_set(&xas, index); >>> - goto retry; >>> + kfree(mlru); >>> + goto out; >> >> 1) If xas_error() returns true, we should continue since xas_mem() will handle the NOMEM >> case for us. 
> > Oh, My bad, I think I lost an intermediate commit for this part and > ended up exiting too early on error case; and it also needs to check > !mlru case above and return -ENOMEM, will update this part. > >> >> 2) If xas_load() returns a non-NULL pointer meaning there is a concurrent thread has >> assigned a new mlru for us, we should continue since we might have not assigned a mlru >> for the leaf memory cgroup. Suppose the following case: >> >> A >> / \ >> B C >> >> If there are two threads allocating mlru for A, then either B or C will not allocate a mlru. >> Here is a code snippet FYI. > > Nice find, forgot the case when multiple children could race for one parent. > >> >> Thanks. >> >> ```c >> struct list_lru_memcg *mlru = NULL; >> >> do { >> /* ... */ >> if (!mlru) >> mlru = memcg_init_list_lru_one(gfp); >> >> xas_set(&xas, pos->kmemcg_id); >> do { >> xas_lock_irqsave(&xas, flags); >> if (css_is_dying(&pos->css) || xas_load(&xas)) >> goto unlock; >> xas_store(&xas, mlru); >> if (!xas_error(&xas)) >> mlru = NULL; >> unlock: >> xas_unlock_irqrestore(&xas, flags); >> } while (xas_nomem(&xas, gfp)); >> } while (pos != memcg); >> >> if (mlru) >> kfree(mlru); > > Good suggestion, one more thing is, should it abort the iteration if > the memcg is dead? Considering handling the memcg_init_list_lru_one > failure, so the loop could be this? Agree. > > + mlru = NULL; > + do { > + pos = memcg; > + parent = parent_mem_cgroup(pos); > + while (!memcg_list_lru_allocated(parent, lru)) { > + pos = parent; > + parent = parent_mem_cgroup(pos); > + } > + > + mlru = mlru ?: memcg_init_list_lru_one(gfp); > + if (!mlru) > + return -ENOMEM; > + xas_set(&xas, pos->kmemcg_id); > + do { > + xas_lock_irqsave(&xas, flags); > + if (!xas_load(&xas) && !css_is_dying(&pos->css)) { > + xas_store(&xas, mlru); > + if (!xas_error(&xas)) > + mlru = NULL; > + } > + xas_unlock_irqrestore(&xas, flags); > + } while (xas_nomem(&xas, gfp)); > + } while (!css_is_dying(&pos->css) && pos != memcg); > + > + if (mlru) > + kfree(mlru);
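Putting the two review points and Kairui's follow-up together, the allocation loop being converged on reads roughly as below. This is an annotated restatement of Kairui's snippet above, not the final merged code; the comments only try to capture why each piece is there.

```c
	struct list_lru_memcg *mlru = NULL;

	do {
		/* Climb from memcg to the farthest ancestor without an mlru. */
		pos = memcg;
		parent = parent_mem_cgroup(pos);
		while (!memcg_list_lru_allocated(parent, lru)) {
			pos = parent;
			parent = parent_mem_cgroup(pos);
		}

		/*
		 * Reuse an mlru left over from a previous pass (e.g. when a
		 * racing task already populated that level), so the leaf
		 * still gets one; this covers the A/B/C race above.
		 */
		mlru = mlru ?: memcg_init_list_lru_one(gfp);
		if (!mlru)
			return -ENOMEM;

		xas_set(&xas, pos->kmemcg_id);
		do {
			xas_lock_irqsave(&xas, flags);
			/* Skip dying cgroups and slots someone else filled. */
			if (!xas_load(&xas) && !css_is_dying(&pos->css)) {
				xas_store(&xas, mlru);
				if (!xas_error(&xas))
					mlru = NULL;	/* now owned by the XArray */
			}
			xas_unlock_irqrestore(&xas, flags);
			/* On -ENOMEM, preallocate outside the lock and retry. */
		} while (xas_nomem(&xas, gfp));
	} while (!css_is_dying(&pos->css) && pos != memcg);

	/* The walk was aborted or a racing task filled the last slot. */
	if (mlru)
		kfree(mlru);
```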
diff --git a/mm/list_lru.c b/mm/list_lru.c index 4c619857e916..ac8aec8451dd 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } return &lru->node[nid].lru; } + +static inline struct list_lru_one * +list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) +{ + struct list_lru_one *l; +again: + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + if (likely(l)) + return l; + + memcg = parent_mem_cgroup(memcg); + WARN_ON(!css_is_dying(&memcg->css)); + goto again; +} #else static void list_lru_register(struct list_lru *lru) { @@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } + +static inline struct list_lru_one * +list_lru_from_memcg(struct list_lru *lru, int nid, int idx) +{ + return &lru->node[nid].lru; +} #endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, @@ -93,7 +113,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + l = list_lru_from_memcg(lru, nid, memcg); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) @@ -124,7 +144,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + l = list_lru_from_memcg(lru, nid, memcg); list_del_init(item); l->nr_items--; nlru->nr_items--; @@ -339,20 +359,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) return mlru; } -static void memcg_list_lru_free(struct list_lru *lru, int src_idx) -{ - struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); - - /* - * The __list_lru_walk_one() can walk the list of this node. - * We need kvfree_rcu() here. And the walking of the list - * is under lru->node[nid]->lock, which can serve as a RCU - * read-side critical section. - */ - if (mlru) - kvfree_rcu(mlru, rcu); -} - static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) @@ -377,22 +383,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru) } static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, - int src_idx, struct mem_cgroup *dst_memcg) + struct list_lru_one *src, + struct mem_cgroup *dst_memcg) { struct list_lru_node *nlru = &lru->node[nid]; - int dst_idx = dst_memcg->kmemcg_id; - struct list_lru_one *src, *dst; + struct list_lru_one *dst; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. 
*/ spin_lock_irq(&nlru->lock); - - src = list_lru_from_memcg_idx(lru, nid, src_idx); - if (!src) - goto out; - dst = list_lru_from_memcg_idx(lru, nid, dst_idx); + dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); list_splice_init(&src->list, &dst->list); @@ -401,46 +403,45 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } -out: spin_unlock_irq(&nlru->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { - struct cgroup_subsys_state *css; struct list_lru *lru; - int src_idx = memcg->kmemcg_id, i; - - /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. - */ - rcu_read_lock(); - css_for_each_descendant_pre(css, &memcg->css) { - struct mem_cgroup *child; - - child = mem_cgroup_from_css(css); - WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); - } - rcu_read_unlock(); + int i; - /* - * With kmemcg_id set to parent, holding the lru lock below can - * prevent list_lru_{add,del,isolate} from touching the lru, safe - * to reparent. - */ mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) { + struct list_lru_memcg *mlru; + XA_STATE(xas, &lru->xa, memcg->kmemcg_id); + + /* + * Lock the Xarray to ensure no on going allocation and + * further allocation will see css_is_dying(). + */ + xas_lock_irq(&xas); + mlru = xas_load(&xas); + if (mlru) + xas_store(&xas, NULL); + xas_unlock_irq(&xas); + if (!mlru) + continue; + + /* + * With Xarray value set to NULL, holding the lru lock below + * prevents list_lru_{add,del,isolate} from touching the lru, + * safe to reparent. + */ for_each_node(i) - memcg_reparent_list_lru_node(lru, i, src_idx, parent); + memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed * to remain empty, we can safely free this lru, any further * memcg_list_lru_alloc() call will simply bail out. */ - memcg_list_lru_free(lru, src_idx); + kvfree_rcu(mlru, rcu); } mutex_unlock(&list_lrus_mutex); } @@ -456,78 +457,51 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { - int i; unsigned long flags; - struct list_lru_memcg_table { - struct list_lru_memcg *mlru; - struct mem_cgroup *memcg; - } *table; + struct list_lru_memcg *mlru; + struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; - table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); - if (!table) - return -ENOMEM; - /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ - for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { - if (memcg_list_lru_allocated(memcg, lru)) - break; - - table[i].memcg = memcg; - table[i].mlru = memcg_init_list_lru_one(gfp); - if (!table[i].mlru) { - while (i--) - kfree(table[i].mlru); - kfree(table); - return -ENOMEM; + do { + /* + * Keep finding the farest parent that wasn't populated + * until found memcg itself. 
+ */ + pos = memcg; + parent = parent_mem_cgroup(pos); + while (parent && !memcg_list_lru_allocated(parent, lru)) { + pos = parent; + parent = parent_mem_cgroup(pos); } - } - xas_lock_irqsave(&xas, flags); - while (i--) { - int index = READ_ONCE(table[i].memcg->kmemcg_id); - struct list_lru_memcg *mlru = table[i].mlru; - - xas_set(&xas, index); -retry: - if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { - kfree(mlru); - } else { - xas_store(&xas, mlru); - if (xas_error(&xas) == -ENOMEM) { + mlru = memcg_init_list_lru_one(gfp); + do { + bool alloced = false; + + xas_set(&xas, pos->kmemcg_id); + xas_lock_irqsave(&xas, flags); + if (!css_is_dying(&pos->css) && !xas_load(&xas)) { + xas_store(&xas, mlru); + alloced = true; + } + if (!alloced || xas_error(&xas)) { xas_unlock_irqrestore(&xas, flags); - if (xas_nomem(&xas, gfp)) - xas_set_err(&xas, 0); - xas_lock_irqsave(&xas, flags); - /* - * The xas lock has been released, this memcg - * can be reparented before us. So reload - * memcg id. More details see the comments - * in memcg_reparent_list_lrus(). - */ - index = READ_ONCE(table[i].memcg->kmemcg_id); - if (index < 0) - xas_set_err(&xas, 0); - else if (!xas_error(&xas) && index != xas.xa_index) - xas_set(&xas, index); - goto retry; + kfree(mlru); + goto out; } - } - } - /* xas_nomem() is used to free memory instead of memory allocation. */ - if (xas.xa_alloc) - xas_nomem(&xas, gfp); - xas_unlock_irqrestore(&xas, flags); - kfree(table); - + xas_unlock_irqrestore(&xas, flags); + } while (xas_nomem(&xas, gfp)); + } while (pos != memcg); +out: return xas_error(&xas); } #else diff --git a/mm/zswap.c b/mm/zswap.c index a50e2986cd2f..c6e2256347ff 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -718,12 +718,11 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) /* * Note that it is safe to use rcu_read_lock() here, even in the face of - * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection - * used in list_lru lookup, only two scenarios are possible: + * concurrent memcg offlining: * - * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * 1. list_lru_add() is called before list_lru_memcg is erased. The * new entry will be reparented to memcg's parent's list_lru. - * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * 2. list_lru_add() is called after list_lru_memcg is erased. The * new entry will be added directly to memcg's parent's list_lru. * * Similar reasoning holds for list_lru_del().
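For readers tracing the lookup path after this change, below is the new list_lru_from_memcg() helper from the diff above, with comments added here to summarize the fallback behaviour discussed in the thread; the comments are not part of the patch.

```c
static inline struct list_lru_one *
list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg)
{
	struct list_lru_one *l;
again:
	/* Common case: this memcg still has its own slot in lru->xa. */
	l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
	if (likely(l))
		return l;

	/*
	 * The slot was erased by memcg_reparent_list_lrus(), which only
	 * happens to a dying cgroup whose items were spliced into an
	 * ancestor's lists.  Fall back one level and retry; as discussed
	 * above, this may repeat over a chain of dying ancestors and in
	 * the worst case ends at the root cgroup.
	 */
	memcg = parent_mem_cgroup(memcg);
	WARN_ON(!css_is_dying(&memcg->css));
	goto again;
}
```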