
[v2,4/5] mm: make memcg visible to lru walker isolation function

Message ID 1577174006-13025-5-git-send-email-laoar.shao@gmail.com (mailing list archive)
State New, archived
Series: protect page cache from freeing inode

Commit Message

Yafang Shao Dec. 24, 2019, 7:53 a.m. UTC
The lru walker isolation function may use this memcg to do something, e.g.
the inode isolation function will use the memcg to do inode protection in a
followup patch. So make the memcg visible to the lru walker isolation
function.

One thing that should be emphasized in this patch is that it replaces
for_each_memcg_cache_index() with for_each_mem_cgroup() in
list_lru_walk_node(). There is a gap between these two macros:
for_each_mem_cgroup() depends on CONFIG_MEMCG while the other depends on
CONFIG_MEMCG_KMEM. But as list_lru_memcg_aware() returns false if
CONFIG_MEMCG_KMEM is not configured, this replacement is safe.

Cc: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/linux/memcontrol.h | 21 +++++++++++++++++++++
 mm/list_lru.c              | 22 ++++++++++++----------
 mm/memcontrol.c            | 15 ---------------
 3 files changed, 33 insertions(+), 25 deletions(-)
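
For context on the safety claim in the commit message: when CONFIG_MEMCG_KMEM
is not configured, list_lru_memcg_aware() is a stub that always returns false,
so the new for_each_mem_cgroup() branch is never reached. A minimal sketch of
the two variants, paraphrased from mm/list_lru.c of this era (not a verbatim
copy; the exact field used may differ):

	#ifdef CONFIG_MEMCG_KMEM
	static inline bool list_lru_memcg_aware(struct list_lru *lru)
	{
		/* set when the lru was initialised as memcg aware */
		return lru->memcg_aware;
	}
	#else
	static inline bool list_lru_memcg_aware(struct list_lru *lru)
	{
		/* without kmem accounting there are no per-memcg lists */
		return false;
	}
	#endif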

Comments

Dave Chinner Jan. 4, 2020, 3:35 a.m. UTC | #1
On Tue, Dec 24, 2019 at 02:53:25AM -0500, Yafang Shao wrote:
> The lru walker isolation function may use this memcg to do something, e.g.
> the inode isolatation function will use the memcg to do inode protection in
> followup patch. So make memcg visible to the lru walker isolation function.
> 
> Something should be emphasized in this patch is it replaces
> for_each_memcg_cache_index() with for_each_mem_cgroup() in
> list_lru_walk_node(). Because there's a gap between these two MACROs that
> for_each_mem_cgroup() depends on CONFIG_MEMCG while the other one depends
> on CONFIG_MEMCG_KMEM. But as list_lru_memcg_aware() returns false if
> CONFIG_MEMCG_KMEM is not configured, it is safe to this replacement.
> 
> Cc: Dave Chinner <dchinner@redhat.com>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>

....

> @@ -299,17 +299,15 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
>  				 list_lru_walk_cb isolate, void *cb_arg,
>  				 unsigned long *nr_to_walk)
>  {
> +	struct mem_cgroup *memcg;
>  	long isolated = 0;
> -	int memcg_idx;
>  
> -	isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> -				      nr_to_walk);
> -	if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
> -		for_each_memcg_cache_index(memcg_idx) {
> +	if (list_lru_memcg_aware(lru)) {
> +		for_each_mem_cgroup(memcg) {
>  			struct list_lru_node *nlru = &lru->node[nid];
>  
>  			spin_lock(&nlru->lock);
> -			isolated += __list_lru_walk_one(nlru, memcg_idx,
> +			isolated += __list_lru_walk_one(nlru, memcg,
>  							isolate, cb_arg,
>  							nr_to_walk);
>  			spin_unlock(&nlru->lock);
> @@ -317,7 +315,11 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
>  			if (*nr_to_walk <= 0)
>  				break;
>  		}
> +	} else {
> +		isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> +					      nr_to_walk);
>  	}
> +

That's a change of behaviour. The old code always runs per-node
reclaim, then if the LRU is memcg aware it also runs the memcg
aware reclaim. The new code never runs global per-node reclaim
if the list is memcg aware, so shrinkers that are initialised
with the flags SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE seem
likely to have reclaim problems with mixed memcg/global memory
pressure scenarios.

e.g. if all the memory is in the per-node lists, and the memcg needs
to reclaim memory because of a global shortage, it is now unable to
reclaim global memory.....
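
For readers less familiar with the shrinker API, here is a hedged sketch of
the kind of shrinker being referred to - one registered as both NUMA and
memcg aware. The callbacks are illustrative stubs, not taken from any real
shrinker:

	static unsigned long demo_count_objects(struct shrinker *shrink,
						struct shrink_control *sc)
	{
		/* a real shrinker would report the count for sc->nid / sc->memcg */
		return 0;
	}

	static unsigned long demo_scan_objects(struct shrinker *shrink,
					       struct shrink_control *sc)
	{
		/* a real shrinker would walk its list_lru for sc->nid / sc->memcg */
		return SHRINK_STOP;
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count_objects,
		.scan_objects	= demo_scan_objects,
		.seeks		= DEFAULT_SEEKS,
		/* both NUMA and memcg aware - the combination discussed above */
		.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
	};

Such a shrinker would be registered with register_shrinker(&demo_shrinker).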

Cheers,

Dave.
Yafang Shao Jan. 4, 2020, 7:26 a.m. UTC | #2
On Sat, Jan 4, 2020 at 11:36 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, Dec 24, 2019 at 02:53:25AM -0500, Yafang Shao wrote:
> > The lru walker isolation function may use this memcg to do something, e.g.
> > the inode isolatation function will use the memcg to do inode protection in
> > followup patch. So make memcg visible to the lru walker isolation function.
> >
> > Something should be emphasized in this patch is it replaces
> > for_each_memcg_cache_index() with for_each_mem_cgroup() in
> > list_lru_walk_node(). Because there's a gap between these two MACROs that
> > for_each_mem_cgroup() depends on CONFIG_MEMCG while the other one depends
> > on CONFIG_MEMCG_KMEM. But as list_lru_memcg_aware() returns false if
> > CONFIG_MEMCG_KMEM is not configured, it is safe to this replacement.
> >
> > Cc: Dave Chinner <dchinner@redhat.com>
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
>
> ....
>
> > @@ -299,17 +299,15 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> >                                list_lru_walk_cb isolate, void *cb_arg,
> >                                unsigned long *nr_to_walk)
> >  {
> > +     struct mem_cgroup *memcg;
> >       long isolated = 0;
> > -     int memcg_idx;
> >
> > -     isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > -                                   nr_to_walk);
> > -     if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
> > -             for_each_memcg_cache_index(memcg_idx) {
> > +     if (list_lru_memcg_aware(lru)) {
> > +             for_each_mem_cgroup(memcg) {
> >                       struct list_lru_node *nlru = &lru->node[nid];
> >
> >                       spin_lock(&nlru->lock);
> > -                     isolated += __list_lru_walk_one(nlru, memcg_idx,
> > +                     isolated += __list_lru_walk_one(nlru, memcg,
> >                                                       isolate, cb_arg,
> >                                                       nr_to_walk);
> >                       spin_unlock(&nlru->lock);
> > @@ -317,7 +315,11 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> >                       if (*nr_to_walk <= 0)
> >                               break;
> >               }
> > +     } else {
> > +             isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > +                                           nr_to_walk);
> >       }
> > +
>
> That's a change of behaviour. The old code always runs per-node
> reclaim, then if the LRU is memcg aware it also runs the memcg
> aware reclaim. The new code never runs global per-node reclaim
> if the list is memcg aware, so shrinkers that are initialised
> with the flags SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE seem
> likely to have reclaim problems with mixed memcg/global memory
> pressure scenarios.
>
> e.g. if all the memory is in the per-node lists, and the memcg needs
> to reclaim memory because of a global shortage, it is now unable to
> reclaim global memory.....
>

Hi Dave,

Thanks for your detailed explanation.
But I have a different understanding.
The difference between for_each_mem_cgroup(memcg) and
for_each_memcg_cache_index(memcg_idx) is that
for_each_mem_cgroup() includes the root_mem_cgroup while
for_each_memcg_cache_index() excludes the root_mem_cgroup, because
its memcg_idx is -1.
So it can reclaim global memory even if the list is memcg aware.
Is that right?

Thanks
Yafang
Dave Chinner Jan. 4, 2020, 9:23 p.m. UTC | #3
On Sat, Jan 04, 2020 at 03:26:13PM +0800, Yafang Shao wrote:
> On Sat, Jan 4, 2020 at 11:36 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Tue, Dec 24, 2019 at 02:53:25AM -0500, Yafang Shao wrote:
> > > The lru walker isolation function may use this memcg to do something, e.g.
> > > the inode isolatation function will use the memcg to do inode protection in
> > > followup patch. So make memcg visible to the lru walker isolation function.
> > >
> > > Something should be emphasized in this patch is it replaces
> > > for_each_memcg_cache_index() with for_each_mem_cgroup() in
> > > list_lru_walk_node(). Because there's a gap between these two MACROs that
> > > for_each_mem_cgroup() depends on CONFIG_MEMCG while the other one depends
> > > on CONFIG_MEMCG_KMEM. But as list_lru_memcg_aware() returns false if
> > > CONFIG_MEMCG_KMEM is not configured, it is safe to this replacement.
> > >
> > > Cc: Dave Chinner <dchinner@redhat.com>
> > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> >
> > ....
> >
> > > @@ -299,17 +299,15 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> > >                                list_lru_walk_cb isolate, void *cb_arg,
> > >                                unsigned long *nr_to_walk)
> > >  {
> > > +     struct mem_cgroup *memcg;
> > >       long isolated = 0;
> > > -     int memcg_idx;
> > >
> > > -     isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > > -                                   nr_to_walk);
> > > -     if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
> > > -             for_each_memcg_cache_index(memcg_idx) {
> > > +     if (list_lru_memcg_aware(lru)) {
> > > +             for_each_mem_cgroup(memcg) {
> > >                       struct list_lru_node *nlru = &lru->node[nid];
> > >
> > >                       spin_lock(&nlru->lock);
> > > -                     isolated += __list_lru_walk_one(nlru, memcg_idx,
> > > +                     isolated += __list_lru_walk_one(nlru, memcg,
> > >                                                       isolate, cb_arg,
> > >                                                       nr_to_walk);
> > >                       spin_unlock(&nlru->lock);
> > > @@ -317,7 +315,11 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> > >                       if (*nr_to_walk <= 0)
> > >                               break;
> > >               }
> > > +     } else {
> > > +             isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > > +                                           nr_to_walk);
> > >       }
> > > +
> >
> > That's a change of behaviour. The old code always runs per-node
> > reclaim, then if the LRU is memcg aware it also runs the memcg
> > aware reclaim. The new code never runs global per-node reclaim
> > if the list is memcg aware, so shrinkers that are initialised
> > with the flags SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE seem
> > likely to have reclaim problems with mixed memcg/global memory
> > pressure scenarios.
> >
> > e.g. if all the memory is in the per-node lists, and the memcg needs
> > to reclaim memory because of a global shortage, it is now unable to
> > reclaim global memory.....
> >
> 
> Hi Dave,
> 
> Thanks for your detailed explanation.
> But I have different understanding.
> The difference between for_each_mem_cgroup(memcg) and
> for_each_memcg_cache_index(memcg_idx) is that the
> for_each_mem_cgroup() includes the root_mem_cgroup while the
> for_each_memcg_cache_index() excludes the root_mem_cgroup because the
> memcg_idx of it is -1.

Except that the "root" memcg that for_each_mem_cgroup() starts from is
not the "global root" memcg - it is whatever memcg is passed down in
the shrink_control, wherever that sits in the cgroup tree hierarchy.
do_shrink_slab() only ever passes down the global root memcg to the
shrinkers when the global root memcg is passed to shrink_slab(), and
that does not iterate the memcg hierarchy - it just wants to
reclaim from global caches and non-memcg aware shrinkers.

IOWs, there are multiple changes in behaviour here - memcg specific
reclaim won't do global reclaim, and global reclaim will now iterate
all memcgs instead of just the global root memcg.
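
A rough sketch of the dispatch being described, paraphrased from mm/vmscan.c
of this era (locking, SHRINK_EMPTY handling and other details omitted - an
illustration, not the exact code):

	static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
					 struct mem_cgroup *memcg, int priority)
	{
		unsigned long freed = 0;
		struct shrinker *shrinker;

		/* memcg-context reclaim only walks that memcg's shrinker lists */
		if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
			return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

		/* global reclaim: every registered shrinker, sc.memcg = root */
		list_for_each_entry(shrinker, &shrinker_list, list) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};

			freed += do_shrink_slab(&sc, shrinker, priority);
		}
		return freed;
	}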

> So it can reclaim global memory even if the list is memcg aware.
> Is that right ?

If the memcg passed to this function is the root memcg, then yes,
it will behave as you suggest. But for the majority of memcg-context
reclaim, the memcg is not the root memcg and so they will not do
global reclaim anymore...

Cheers,

Dave.
Yafang Shao Jan. 5, 2020, 1:43 a.m. UTC | #4
On Sun, Jan 5, 2020 at 5:23 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Sat, Jan 04, 2020 at 03:26:13PM +0800, Yafang Shao wrote:
> > On Sat, Jan 4, 2020 at 11:36 AM Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Tue, Dec 24, 2019 at 02:53:25AM -0500, Yafang Shao wrote:
> > > > The lru walker isolation function may use this memcg to do something, e.g.
> > > > the inode isolatation function will use the memcg to do inode protection in
> > > > followup patch. So make memcg visible to the lru walker isolation function.
> > > >
> > > > Something should be emphasized in this patch is it replaces
> > > > for_each_memcg_cache_index() with for_each_mem_cgroup() in
> > > > list_lru_walk_node(). Because there's a gap between these two MACROs that
> > > > for_each_mem_cgroup() depends on CONFIG_MEMCG while the other one depends
> > > > on CONFIG_MEMCG_KMEM. But as list_lru_memcg_aware() returns false if
> > > > CONFIG_MEMCG_KMEM is not configured, it is safe to this replacement.
> > > >
> > > > Cc: Dave Chinner <dchinner@redhat.com>
> > > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > >
> > > ....
> > >
> > > > @@ -299,17 +299,15 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> > > >                                list_lru_walk_cb isolate, void *cb_arg,
> > > >                                unsigned long *nr_to_walk)
> > > >  {
> > > > +     struct mem_cgroup *memcg;
> > > >       long isolated = 0;
> > > > -     int memcg_idx;
> > > >
> > > > -     isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > > > -                                   nr_to_walk);
> > > > -     if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
> > > > -             for_each_memcg_cache_index(memcg_idx) {
> > > > +     if (list_lru_memcg_aware(lru)) {
> > > > +             for_each_mem_cgroup(memcg) {
> > > >                       struct list_lru_node *nlru = &lru->node[nid];
> > > >
> > > >                       spin_lock(&nlru->lock);
> > > > -                     isolated += __list_lru_walk_one(nlru, memcg_idx,
> > > > +                     isolated += __list_lru_walk_one(nlru, memcg,
> > > >                                                       isolate, cb_arg,
> > > >                                                       nr_to_walk);
> > > >                       spin_unlock(&nlru->lock);
> > > > @@ -317,7 +315,11 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> > > >                       if (*nr_to_walk <= 0)
> > > >                               break;
> > > >               }
> > > > +     } else {
> > > > +             isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > > > +                                           nr_to_walk);
> > > >       }
> > > > +
> > >
> > > That's a change of behaviour. The old code always runs per-node
> > > reclaim, then if the LRU is memcg aware it also runs the memcg
> > > aware reclaim. The new code never runs global per-node reclaim
> > > if the list is memcg aware, so shrinkers that are initialised
> > > with the flags SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE seem
> > > likely to have reclaim problems with mixed memcg/global memory
> > > pressure scenarios.
> > >
> > > e.g. if all the memory is in the per-node lists, and the memcg needs
> > > to reclaim memory because of a global shortage, it is now unable to
> > > reclaim global memory.....
> > >
> >
> > Hi Dave,
> >
> > Thanks for your detailed explanation.
> > But I have different understanding.
> > The difference between for_each_mem_cgroup(memcg) and
> > for_each_memcg_cache_index(memcg_idx) is that the
> > for_each_mem_cgroup() includes the root_mem_cgroup while the
> > for_each_memcg_cache_index() excludes the root_mem_cgroup because the
> > memcg_idx of it is -1.
>
> Except that the "root" memcg that for_each_mem_cgroup() is not the
> "global root" memcg - it is whatever memcg that is passed down in
> the shrink_control, whereever that sits in the cgroup tree heirarchy.
> do_shrink_slab() only ever passes down the global root memcg to the
> shrinkers when the global root memcg is passed to shrink_slab(), and
> that does not iterate the memcg heirarchy - it just wants to
> reclaim from global caches an non-memcg aware shrinkers.
>
> IOWs, there are multiple changes in behaviour here - memcg specific
> reclaim won't do global reclaim, and global reclaim will now iterate
> all memcgs instead of just the global root memcg.
>
> > So it can reclaim global memory even if the list is memcg aware.
> > Is that right ?
>
> If the memcg passed to this fucntion is the root memcg, then yes,
> it will behave as you suggest. But for the majority of memcg-context
> reclaim, the memcg is not the root memcg and so they will not do
> global reclaim anymore...
>

Thanks for your reply.
But I have to clarify that this change is in list_lru_walk_node(), and
the memcg is not passed to this function from shrink_control.
To make it more clear, I paste the function here.

- The new function
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
                                 list_lru_walk_cb isolate, void *cb_arg,
                                 unsigned long *nr_to_walk)
{
        struct mem_cgroup *memcg;   <<<< A local variable
        long isolated = 0;

        if (list_lru_memcg_aware(lru)) {
                for_each_mem_cgroup(memcg) {  <<<< scan all MEMCGs, including root_mem_cgroup
                        struct list_lru_node *nlru = &lru->node[nid];

                        spin_lock(&nlru->lock);
                        isolated += __list_lru_walk_one(nlru, memcg,
                                                        isolate, cb_arg,
                                                        nr_to_walk);
                        spin_unlock(&nlru->lock);

                        if (*nr_to_walk <= 0)
                                break;
                }
        } else {                    <<<< scan global memory only (root_mem_cgroup)
                isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
                                              nr_to_walk);
        }

        return isolated;
}

- While the original function is,

unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
                                 list_lru_walk_cb isolate, void *cb_arg,
                                 unsigned long *nr_to_walk)
{
        long isolated = 0;
        int memcg_idx;

        isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
                                      nr_to_walk);
            <<<<  scan global memory only (root_mem_cgroup)
        if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
                for_each_memcg_cache_index(memcg_idx) {   <<<< scan all MEMCGs, excluding root_mem_cgroup
                        struct list_lru_node *nlru = &lru->node[nid];

                        spin_lock(&nlru->lock);
                        isolated += __list_lru_walk_one(nlru, memcg_idx,
                                                        isolate, cb_arg,
                                                        nr_to_walk);
                        spin_unlock(&nlru->lock);

                        if (*nr_to_walk <= 0)
                                break;
                }
        }

        return isolated;
}

Is that right?

Thanks
Yafang
Dave Chinner Jan. 6, 2020, 12:17 a.m. UTC | #5
On Sun, Jan 05, 2020 at 09:43:11AM +0800, Yafang Shao wrote:
> On Sun, Jan 5, 2020 at 5:23 AM Dave Chinner <david@fromorbit.com> wrote:
> > On Sat, Jan 04, 2020 at 03:26:13PM +0800, Yafang Shao wrote:
> > > On Sat, Jan 4, 2020 at 11:36 AM Dave Chinner <david@fromorbit.com> wrote:
> > > > On Tue, Dec 24, 2019 at 02:53:25AM -0500, Yafang Shao wrote:
> > > > > The lru walker isolation function may use this memcg to do something, e.g.
> > > > > the inode isolatation function will use the memcg to do inode protection in
> > > > > followup patch. So make memcg visible to the lru walker isolation function.
> > > > >
> > > > > Something should be emphasized in this patch is it replaces
> > > > > for_each_memcg_cache_index() with for_each_mem_cgroup() in
> > > > > list_lru_walk_node(). Because there's a gap between these two MACROs that
> > > > > for_each_mem_cgroup() depends on CONFIG_MEMCG while the other one depends
> > > > > on CONFIG_MEMCG_KMEM. But as list_lru_memcg_aware() returns false if
> > > > > CONFIG_MEMCG_KMEM is not configured, it is safe to this replacement.
> > > > >
> > > > > Cc: Dave Chinner <dchinner@redhat.com>
> > > > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > > >
> > > > ....
> > > >
> > > > > @@ -299,17 +299,15 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> > > > >                                list_lru_walk_cb isolate, void *cb_arg,
> > > > >                                unsigned long *nr_to_walk)
> > > > >  {
> > > > > +     struct mem_cgroup *memcg;
> > > > >       long isolated = 0;
> > > > > -     int memcg_idx;
> > > > >
> > > > > -     isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > > > > -                                   nr_to_walk);
> > > > > -     if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
> > > > > -             for_each_memcg_cache_index(memcg_idx) {
> > > > > +     if (list_lru_memcg_aware(lru)) {
> > > > > +             for_each_mem_cgroup(memcg) {
> > > > >                       struct list_lru_node *nlru = &lru->node[nid];
> > > > >
> > > > >                       spin_lock(&nlru->lock);
> > > > > -                     isolated += __list_lru_walk_one(nlru, memcg_idx,
> > > > > +                     isolated += __list_lru_walk_one(nlru, memcg,
> > > > >                                                       isolate, cb_arg,
> > > > >                                                       nr_to_walk);
> > > > >                       spin_unlock(&nlru->lock);
> > > > > @@ -317,7 +315,11 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> > > > >                       if (*nr_to_walk <= 0)
> > > > >                               break;
> > > > >               }
> > > > > +     } else {
> > > > > +             isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > > > > +                                           nr_to_walk);
> > > > >       }
> > > > > +
> > > >
> > > > That's a change of behaviour. The old code always runs per-node
> > > > reclaim, then if the LRU is memcg aware it also runs the memcg
> > > > aware reclaim. The new code never runs global per-node reclaim
> > > > if the list is memcg aware, so shrinkers that are initialised
> > > > with the flags SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE seem
> > > > likely to have reclaim problems with mixed memcg/global memory
> > > > pressure scenarios.
> > > >
> > > > e.g. if all the memory is in the per-node lists, and the memcg needs
> > > > to reclaim memory because of a global shortage, it is now unable to
> > > > reclaim global memory.....
> > > >
> > >
> > > Hi Dave,
> > >
> > > Thanks for your detailed explanation.
> > > But I have different understanding.
> > > The difference between for_each_mem_cgroup(memcg) and
> > > for_each_memcg_cache_index(memcg_idx) is that the
> > > for_each_mem_cgroup() includes the root_mem_cgroup while the
> > > for_each_memcg_cache_index() excludes the root_mem_cgroup because the
> > > memcg_idx of it is -1.
> >
> > Except that the "root" memcg that for_each_mem_cgroup() is not the
> > "global root" memcg - it is whatever memcg that is passed down in
> > the shrink_control, whereever that sits in the cgroup tree heirarchy.
> > do_shrink_slab() only ever passes down the global root memcg to the
> > shrinkers when the global root memcg is passed to shrink_slab(), and
> > that does not iterate the memcg heirarchy - it just wants to
> > reclaim from global caches an non-memcg aware shrinkers.
> >
> > IOWs, there are multiple changes in behaviour here - memcg specific
> > reclaim won't do global reclaim, and global reclaim will now iterate
> > all memcgs instead of just the global root memcg.
> >
> > > So it can reclaim global memory even if the list is memcg aware.
> > > Is that right ?
> >
> > If the memcg passed to this fucntion is the root memcg, then yes,
> > it will behave as you suggest. But for the majority of memcg-context
> > reclaim, the memcg is not the root memcg and so they will not do
> > global reclaim anymore...
> >
> 
> Thanks for you reply.
> But I have to clairfy that this change is in list_lru_walk_node(), and
> the memcg is not passed to this funtion from shrink_control.
> In order to make it more clear, I paste the function here.
> 
> - The new function
> unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
>                                  list_lru_walk_cb isolate, void *cb_arg,
>                                  unsigned long *nr_to_walk)
> {
>         struct mem_cgroup *memcg;   <<<< A local variable
>         long isolated = 0;
> 
>         if (list_lru_memcg_aware(lru)) {
>                 for_each_mem_cgroup(memcg) {  <<<<  scan all MEMCGs,
> including root_mem_cgroup

Oh, wait, this is list_lru_walk_node(), which is not even called by
the shrinker code - this is the "iterate every object" code
path. So this isn't even part of the code path this patchset needs
the memcg for.

<does some code spelunking>

Gawd, what a mess this memcg stuff is when you look at it from the
outside. After digging, I find that the root memcg has a kmemcg_id =
-1. There is special case code everywhere that checks the id for >=
0, open codes memcg == root_mem_cgroup checks, calls
mem_cgroup_is_root(), etc. to avoid doing things like using the root
memcg kmemcg_id as an array index. And in the case of the list_lru,
this means it doesn't use the memcg lru indexes at all (i.e.
lru->node[nid].memcg_lrus[memcg_idx]), but quietly returns the global
list at lru->node[nid].lru when kmemcg_id < 0.
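
The fallback being described lives in a helper along the lines of
list_lru_from_memcg_idx(); a paraphrased sketch (RCU/lockdep annotations
simplified, not the verbatim source):

	static inline struct list_lru_one *
	list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
	{
		struct list_lru_memcg *memcg_lrus;

		memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
						   lockdep_is_held(&nlru->lock));
		/* a negative index - the root memcg - falls back to the global list */
		if (memcg_lrus && idx >= 0)
			return memcg_lrus->lru[idx];
		return &nlru->lru;
	}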

So, after a couple of hours of wading through the code, I finally
remember all the details of the existing code and understand how
this new code works - it relies entirely on the special casing of
the root memcg that makes it behave like memcgs aren't configured at
all. i.e. we select the global root lists when the root memcg is
passed around rather than using a memcg specific LRU list.

The old code used for_each_memcg_cache_index(), which never
scanned the root memcg:

#define for_each_memcg_cache_index(_idx)        \
        for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)

because it starts at idx = 0, and the root memcg is -1. Hence the
old code always needed to scan the root memcg (the global lists)
manually itself, which made the code really nice and clear because
it was the same for both non-memcg and memcg aware lists. i.e. the
global scan was not hidden away inside the memcg scanning.

IOWs, I find the existing code makes it much easier to see the
difference between global and per-memcg iteration. If we are keeping
this special "root memcg context is special" architecture (more on
that below), I'd prefer that the code keeps that distinction, because
having to understand how the root memcg is special cased just to
work out how this code iterates the global caches is a waste of time
and brain power. Not everyone really wants to know about memcg
internals....

We can clean up the code a lot by getting rid of the unnecessary
indenting by doing this:

	/* iterate the global lru first */
	isolated = list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
					nr_to_walk);
	if (!list_lru_memcg_aware(lru))
		return isolated;

	nlru = &lru->node[nid];
	for_each_mem_cgroup(memcg) {
		/* already scanned the root memcg above */
		if (is_root_memcg(memcg))
			continue;
		if (*nr_to_walk <= 0)
			break;
		spin_lock(&nlru->lock);
		isolated += __list_lru_walk_one(nlru, memcg,
						isolate, cb_arg,
						nr_to_walk);
		spin_unlock(&nlru->lock);
	}
	return isolated;


And so to architecture... This all makes me wonder why we still
special case the memcg LRU lists here. Ever since we went to
per-node memcg lru lists (~2015, iirc), there's been this special
hidden hack for the root memcg to use the global list, and one that
I have to read lots of code to remind myself it exists every time I
have to dig into this code again.

I mean, if the list is not memcg aware, then it only needs a single
list per node - the root memcg list. That could be easily stored in
the memcg_lrus array for the node rather than a separate global
list, and then the node iteration code all starts to look like this:

	nlru = &lru->node[nid];
	for_each_mem_cgroup(memcg) {
		spin_lock(&nlru->lock);
		isolated += __list_lru_walk_one(nlru, memcg,
						isolate, cb_arg,
						nr_to_walk);
		spin_unlock(&nlru->lock);
		if (*nr_to_walk <= 0)
			break;

		/* non-memcg aware lists only have a root memcg list */
		if (!list_lru_memcg_aware(lru))
			break;
	}

Hence for the walks in the !list_lru_memcg_aware(lru) case, the
list_lru_from_memcg() call in __list_lru_walk_one() always returns
just the root list. This makes everything use the same iteration
interface, and when you configure out memcg then we simply code
the iterator to run once and list_lru_from_memcg() always returns
the one list...

i.e. the whole reason for having the separate global and per-memcg
LRU lists went away back in 2015 when it was decided that we don't
care about the (potentially massive) memory usage of per-node memcg
LRU lists. However, the special casing of the root memcg was left in
place, but what your patch says to me is that we should really be
trying to make this special casing go away.

Moving everything in the list-lru to the memcg_lrus also means that
the global shrinker lists get the "objects in the lru to scan for
reclaim" shrinker bitmaps that optimise the high level shrinker
code. It means we could get rid of the separate global vs memcg
shrinker paths in vmscan.c - we just pass the root memcg into
shrink_slab_memcg() when we want to do global shrinker reclaim...

IMO, if memcgs are fundamental to the operation of systems these
days such that the root memcg is treated no differently to all
other memcgs, then let's architect the whole stack that way, not
just make the code hard to read by having little bits of code hide
the special casing of the root memcg layers deep where it's really
hard to find....

And, FWIW, I noticed that the list_lru memcg code assumes we only
ever put objects from memcg associated slab pages in the list_lru.
It calls memcg_from_slab_page() which makes no attempt to verify the
page is actually a slab page. That's a landmine just waiting to go
boom - list-lru can store any type of object the user wants, not
just slab pages. e.g. the binder code stores pages in the list-lru,
not slab objects, and so the only reason it doesn't go boom is that
the lru-list is not configured to be memcg aware....

Cheers,

Dave.
Yafang Shao Jan. 6, 2020, 2:41 p.m. UTC | #6
On Mon, Jan 6, 2020 at 8:17 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Sun, Jan 05, 2020 at 09:43:11AM +0800, Yafang Shao wrote:
> > On Sun, Jan 5, 2020 at 5:23 AM Dave Chinner <david@fromorbit.com> wrote:
> > > On Sat, Jan 04, 2020 at 03:26:13PM +0800, Yafang Shao wrote:
> > > > On Sat, Jan 4, 2020 at 11:36 AM Dave Chinner <david@fromorbit.com> wrote:
> > > > > On Tue, Dec 24, 2019 at 02:53:25AM -0500, Yafang Shao wrote:
> > > > > > The lru walker isolation function may use this memcg to do something, e.g.
> > > > > > the inode isolatation function will use the memcg to do inode protection in
> > > > > > followup patch. So make memcg visible to the lru walker isolation function.
> > > > > >
> > > > > > Something should be emphasized in this patch is it replaces
> > > > > > for_each_memcg_cache_index() with for_each_mem_cgroup() in
> > > > > > list_lru_walk_node(). Because there's a gap between these two MACROs that
> > > > > > for_each_mem_cgroup() depends on CONFIG_MEMCG while the other one depends
> > > > > > on CONFIG_MEMCG_KMEM. But as list_lru_memcg_aware() returns false if
> > > > > > CONFIG_MEMCG_KMEM is not configured, it is safe to this replacement.
> > > > > >
> > > > > > Cc: Dave Chinner <dchinner@redhat.com>
> > > > > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > > > >
> > > > > ....
> > > > >
> > > > > > @@ -299,17 +299,15 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> > > > > >                                list_lru_walk_cb isolate, void *cb_arg,
> > > > > >                                unsigned long *nr_to_walk)
> > > > > >  {
> > > > > > +     struct mem_cgroup *memcg;
> > > > > >       long isolated = 0;
> > > > > > -     int memcg_idx;
> > > > > >
> > > > > > -     isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > > > > > -                                   nr_to_walk);
> > > > > > -     if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
> > > > > > -             for_each_memcg_cache_index(memcg_idx) {
> > > > > > +     if (list_lru_memcg_aware(lru)) {
> > > > > > +             for_each_mem_cgroup(memcg) {
> > > > > >                       struct list_lru_node *nlru = &lru->node[nid];
> > > > > >
> > > > > >                       spin_lock(&nlru->lock);
> > > > > > -                     isolated += __list_lru_walk_one(nlru, memcg_idx,
> > > > > > +                     isolated += __list_lru_walk_one(nlru, memcg,
> > > > > >                                                       isolate, cb_arg,
> > > > > >                                                       nr_to_walk);
> > > > > >                       spin_unlock(&nlru->lock);
> > > > > > @@ -317,7 +315,11 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> > > > > >                       if (*nr_to_walk <= 0)
> > > > > >                               break;
> > > > > >               }
> > > > > > +     } else {
> > > > > > +             isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > > > > > +                                           nr_to_walk);
> > > > > >       }
> > > > > > +
> > > > >
> > > > > That's a change of behaviour. The old code always runs per-node
> > > > > reclaim, then if the LRU is memcg aware it also runs the memcg
> > > > > aware reclaim. The new code never runs global per-node reclaim
> > > > > if the list is memcg aware, so shrinkers that are initialised
> > > > > with the flags SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE seem
> > > > > likely to have reclaim problems with mixed memcg/global memory
> > > > > pressure scenarios.
> > > > >
> > > > > e.g. if all the memory is in the per-node lists, and the memcg needs
> > > > > to reclaim memory because of a global shortage, it is now unable to
> > > > > reclaim global memory.....
> > > > >
> > > >
> > > > Hi Dave,
> > > >
> > > > Thanks for your detailed explanation.
> > > > But I have different understanding.
> > > > The difference between for_each_mem_cgroup(memcg) and
> > > > for_each_memcg_cache_index(memcg_idx) is that the
> > > > for_each_mem_cgroup() includes the root_mem_cgroup while the
> > > > for_each_memcg_cache_index() excludes the root_mem_cgroup because the
> > > > memcg_idx of it is -1.
> > >
> > > Except that the "root" memcg that for_each_mem_cgroup() is not the
> > > "global root" memcg - it is whatever memcg that is passed down in
> > > the shrink_control, whereever that sits in the cgroup tree heirarchy.
> > > do_shrink_slab() only ever passes down the global root memcg to the
> > > shrinkers when the global root memcg is passed to shrink_slab(), and
> > > that does not iterate the memcg heirarchy - it just wants to
> > > reclaim from global caches an non-memcg aware shrinkers.
> > >
> > > IOWs, there are multiple changes in behaviour here - memcg specific
> > > reclaim won't do global reclaim, and global reclaim will now iterate
> > > all memcgs instead of just the global root memcg.
> > >
> > > > So it can reclaim global memory even if the list is memcg aware.
> > > > Is that right ?
> > >
> > > If the memcg passed to this fucntion is the root memcg, then yes,
> > > it will behave as you suggest. But for the majority of memcg-context
> > > reclaim, the memcg is not the root memcg and so they will not do
> > > global reclaim anymore...
> > >
> >
> > Thanks for you reply.
> > But I have to clairfy that this change is in list_lru_walk_node(), and
> > the memcg is not passed to this funtion from shrink_control.
> > In order to make it more clear, I paste the function here.
> >
> > - The new function
> > unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
> >                                  list_lru_walk_cb isolate, void *cb_arg,
> >                                  unsigned long *nr_to_walk)
> > {
> >         struct mem_cgroup *memcg;   <<<< A local variable
> >         long isolated = 0;
> >
> >         if (list_lru_memcg_aware(lru)) {
> >                 for_each_mem_cgroup(memcg) {  <<<<  scan all MEMCGs,
> > including root_mem_cgroup
>
> Oh, wait, this is list_lru_walk_node(), which is not even called by
> the shrinker code - this is the "iterate every object" code
> path.

Right.

> So this isn't even part of the code path this patchset needs
> the memcg for.
>
> <does some code spelunking>
>
> Gawd, what a mess this memcg stuff is from when you look from the
> outside. After digging, I find that the root memcg has a kmemcg_id =
> -1. There is special case code everywhere that checks the id for >=
> 0, open codes memcg == root_mem_cgroup checks, calls
> mem_cgroup_is_root(), etc to avoid doing things like using the root
> memcg kmemcg_id as an array index. An in the case of the list_lru,
> this means it doesn't use the memcg lru indexes at all (i.e.
> lru->node[nid].memcg_lrus[memcg_idx], but quietly returns the global
> list at lru->node[nid].lru when kmemcg_id < 0.
>
> So, after a couple of hours of wading through the code, I finally
> remember all the details of the existing code and understand how
> this new code works - it relies entirely on the special casing of
> the root memcg that makes it behave like memcgs aren't configured at
> all. i.e. we select the global root lists when the root memcg is
> passed around rather than using a memcg specific LRU list.
>
> The old code used for_each_memcg_cache_index(), which never
> scanned the root memcg:
>
> #define for_each_memcg_cache_index(_idx)        \
>         for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)
>
> because it starts at idx = 0, and the root memcg is -1. Hence the
> old code always needed to scan the root memcg (the global lists)
> manually itself, which made the code really nice and clear because
> it was the same for both non-memcg and memcg aware lists. i.e. the
> global scan was not hidden away inside the memcg scanning.
>
> IOWs, I find the existing code is much easier to see the difference
> between global and per-memcg iteration. IF we are keeping this
> special "root memcg context is special" architecture (more on that
> below), I'd prefer that the code keeps that distinction, because
> having to understand how the root memcg is special cased just to
> work out how this code iterates the global caches is a waste of time
> and brain power. not every one really wants to know about memcg
> internals....
>
> We can clean up the code a lot by getting rid of the unnecessary
> indenting by doing this:
>
>         /* iterate the global lru first */
>         isolated = list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
>                                         nr_to_walk);
>         if (!list_lru_memcg_aware(lru))
>                 return isolated;
>
>         nlru = &lru->node[nid];
>         for_each_mem_cgroup(memcg) {
>                 /* already scanned the root memcg above */
>                 if (is_root_memcg(memcg))
>                         continue;
>                 if (*nr_to_walk <= 0)
>                         break;
>                 spin_lock(&nlru->lock);
>                 isolated += __list_lru_walk_one(nlru, memcg,
>                                                 isolate, cb_arg,
>                                                 nr_to_walk);
>                 spin_unlock(&nlru->lock);
>         }
>         return isolated;
>

That's easier to understand.
I will change it like this in the next version.

>
> And so to architecture... This all makes me wonder why we still
> special case the memcg LRU lists here.

Can't agree more.
The first time I read this code, I wondered why we don't assign a
non-negative number to the kmemcg_id of the root_mem_cgroup and then
use memcg_lrus as well.


> Ever since we went to
> per-node memcg lru lists (~2015, iirc), there's been this special
> hidden hack for the root memcg to use the global list. and one that
> I have to read lots of code to remind myself it exists every time I
> have to did into this code again.
>
> I mean, if the list is not memcg aware, then it only needs a single
> list per node - the root memcg list. That could be easily stored in
> the memcg_lrus array for the node rather than a separate global
> list, and then the node iteration code all starts to look like this:
>
>         nlru = &lru->node[nid];
>         for_each_mem_cgroup(memcg) {
>                 spin_lock(&nlru->lock);
>                 isolated += __list_lru_walk_one(nlru, memcg,
>                                                 isolate, cb_arg,
>                                                 nr_to_walk);
>                 spin_unlock(&nlru->lock);
>                 if (*nr_to_walk <= 0)
>                         break;
>
>                 /* non-memcg aware lists only have a root memcg list */
>                 if (!list_lru_memcg_aware(lru))
>                         break;
>         }
>
> Hence for the walks in the !list_lru_memcg_aware(lru) case, the
> list_lru_from_memcg() call in __list_lru_walk_one() always returns
> just the root list. This makes everything use the same iteration
> interface, and when you configure out memcg then we simply code the
> the iterator to run once and list_lru_from_memcg() always returns
> the one list...
>

Agree with you that it is a better architecture, and I also want to
change it like this.
That would be more clear and easier to maintain.
But it requires lots of code changes; should we address it in a
separate patchset?


> i.e. the whole reason for having the separate global and per-memcg
> LRU lists went away back in 2015 when it was decided that we don't
> care about the (potentially massive) memory usage of per-node memcg
> LRU lists. However, the special casing of the root memcg was left in
> place, but what your patch says to me is that we should really be
> trying to make this special casing go away.
>
> Moving everything in the list-lru to the memcg_lrus also means that
> the global shrinker lists get the "objects in the lru to scan for
> reclaim" shrinker bitmaps that optimise the high level shrinker
> code. It means we could get rid of the separate global vs memcg
> shrinker paths in vmscan.c - we just pass the root memcg into
> shrink_slab_memcg() when we want to do global shrinker reclaim...
>
> IMO, if memcgs are fundamental to the operation of systems these
> days such that there the root memcg is treated no differently to all
> other memcgs, then let's architect the whole stack that way, not
> just make the code hard to read by having little bits of code hide
> the special casing of the root memcg layers deep where it's raelly
> hard to find....
>

Agreed. I will try to improve it and submit a new patchset for this
architectural improvement.

> And, FWIW, I noticed that the list_lru memcg code assumes we only
> ever put objects from memcg associated slab pages in the list_lru.
> It calls memcg_from_slab_page() which makes no attempt to verify the
> page is actually a slab page. That's a landmine just waiting to get
> boom - list-lru can store any type of object the user wants, not
> just slab pages. e.g. the binder code stores pages in the list-lru,
> not slab objects, and so the only reason it doesn't go boom is that
> the lru-list is not configured to be memcg aware....
>

Another new issue.
I will try to diagnose what is hidden in this landmine, and after I
understand it clearly I will submit a new patchset.

Thanks
Yafang
Dave Chinner Jan. 6, 2020, 9:31 p.m. UTC | #7
On Mon, Jan 06, 2020 at 10:41:22PM +0800, Yafang Shao wrote:
> On Mon, Jan 6, 2020 at 8:17 AM Dave Chinner <david@fromorbit.com> wrote:
> > We can clean up the code a lot by getting rid of the unnecessary
> > indenting by doing this:
> >
> >         /* iterate the global lru first */
> >         isolated = list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> >                                         nr_to_walk);
> >         if (!list_lru_memcg_aware(lru))
> >                 return isolated;
> >
> >         nlru = &lru->node[nid];
> >         for_each_mem_cgroup(memcg) {
> >                 /* already scanned the root memcg above */
> >                 if (is_root_memcg(memcg))
> >                         continue;
> >                 if (*nr_to_walk <= 0)
> >                         break;
> >                 spin_lock(&nlru->lock);
> >                 isolated += __list_lru_walk_one(nlru, memcg,
> >                                                 isolate, cb_arg,
> >                                                 nr_to_walk);
> >                 spin_unlock(&nlru->lock);
> >         }
> >         return isolated;
> >
> 
> That's eaiser to understand.
> I wil change it like this in next version.

Thanks!

> >
> > And so to architecture... This all makes me wonder why we still
> > special case the memcg LRU lists here.
> 
> Can't agree more.
> The first time I read this code, I wondered why not assign an
> non-negtive number to kmemcg_id of the root_mem_cgroup and then use
> memcg_lrus as well.

Yeah, I've been wondering why the ID is -1 instead of 0 when we
have a global variable that stores the root memcg, which we can
compare pointers against directly via is_root_memcg(). All it seems
to do is make things more complex by having to special case the root
memcg....
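
For reference, the in-tree root check at this point is just a pointer compare
against the global root_mem_cgroup; roughly (paraphrased from mm/memcontrol.c):

	static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
	{
		return memcg == root_mem_cgroup;
	}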

From that perspective, I do like your change to use the memcg
iterator functions rather than a for loop over the range of indexes,
but I'd much prefer to see this method used consistently everywhere
rather than the way we've duplicated lots of code by tacking memcgs
onto the side of the non-memcg code paths.

> > Ever since we went to
> > per-node memcg lru lists (~2015, iirc), there's been this special
> > hidden hack for the root memcg to use the global list. and one that
> > I have to read lots of code to remind myself it exists every time I
> > have to did into this code again.
> >
> > I mean, if the list is not memcg aware, then it only needs a single
> > list per node - the root memcg list. That could be easily stored in
> > the memcg_lrus array for the node rather than a separate global
> > list, and then the node iteration code all starts to look like this:
> >
> >         nlru = &lru->node[nid];
> >         for_each_mem_cgroup(memcg) {
> >                 spin_lock(&nlru->lock);
> >                 isolated += __list_lru_walk_one(nlru, memcg,
> >                                                 isolate, cb_arg,
> >                                                 nr_to_walk);
> >                 spin_unlock(&nlru->lock);
> >                 if (*nr_to_walk <= 0)
> >                         break;
> >
> >                 /* non-memcg aware lists only have a root memcg list */
> >                 if (!list_lru_memcg_aware(lru))
> >                         break;
> >         }
> >
> > Hence for the walks in the !list_lru_memcg_aware(lru) case, the
> > list_lru_from_memcg() call in __list_lru_walk_one() always returns
> > just the root list. This makes everything use the same iteration
> > interface, and when you configure out memcg then we simply code the
> > the iterator to run once and list_lru_from_memcg() always returns
> > the one list...
> >
> 
> Agree with you that it is a better architecture, and I also want to
> change it like this.
> That would be more clear and easier for maintiance.
> But it requires lots of code changes, should we address it in another
> separate patchset ?

Yes. I think this is a separate piece of work as it spans much more
than just the list-lru infrastructure.

> > And, FWIW, I noticed that the list_lru memcg code assumes we only
> > ever put objects from memcg associated slab pages in the list_lru.
> > It calls memcg_from_slab_page() which makes no attempt to verify the
> > page is actually a slab page. That's a landmine just waiting to get
> > boom - list-lru can store any type of object the user wants, not
> > just slab pages. e.g. the binder code stores pages in the list-lru,
> > not slab objects, and so the only reason it doesn't go boom is that
> > the lru-list is not configured to be memcg aware....
> >
> 
> Another new issue.
> I will try to dignose what hiddened in this landmine is, and after I
> understand it clearly I will submit a new patchset.

The problem is memcg_from_slab_page() uses page->slab_cache directly
to retrieve the owner memcg without first checking the
PageSlab(page) flag. If it's not a slab page, we need to get the
memcg from page->memcg, not page->slab_cache->memcg_params.memcg.

see page_cgroup_ino() for how to safely get the owner memcg from a
random page of unknown type...
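
The pattern being pointed to looks roughly like this; the helper name
memcg_from_lru_object() is made up for illustration, and the body paraphrases
the page_cgroup_ino() approach of checking PageSlab() before trusting
page->slab_cache:

	static struct mem_cgroup *memcg_from_lru_object(struct page *page)
	{
		struct mem_cgroup *memcg;

		rcu_read_lock();
		if (PageSlab(page) && !PageTail(page))
			memcg = memcg_from_slab_page(page);
		else
			memcg = READ_ONCE(page->mem_cgroup);
		/* a real caller would pin the memcg before dropping rcu_read_lock() */
		rcu_read_unlock();

		return memcg;
	}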

Cheers,

Dave.
Yafang Shao Jan. 7, 2020, 1:22 p.m. UTC | #8
On Tue, Jan 7, 2020 at 5:31 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Mon, Jan 06, 2020 at 10:41:22PM +0800, Yafang Shao wrote:
> > On Mon, Jan 6, 2020 at 8:17 AM Dave Chinner <david@fromorbit.com> wrote:
> > > We can clean up the code a lot by getting rid of the unnecessary
> > > indenting by doing this:
> > >
> > >         /* iterate the global lru first */
> > >         isolated = list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
> > >                                         nr_to_walk);
> > >         if (!list_lru_memcg_aware(lru))
> > >                 return isolated;
> > >
> > >         nlru = &lru->node[nid];
> > >         for_each_mem_cgroup(memcg) {
> > >                 /* already scanned the root memcg above */
> > >                 if (is_root_memcg(memcg))
> > >                         continue;
> > >                 if (*nr_to_walk <= 0)
> > >                         break;
> > >                 spin_lock(&nlru->lock);
> > >                 isolated += __list_lru_walk_one(nlru, memcg,
> > >                                                 isolate, cb_arg,
> > >                                                 nr_to_walk);
> > >                 spin_unlock(&nlru->lock);
> > >         }
> > >         return isolated;
> > >
> >
> > That's eaiser to understand.
> > I wil change it like this in next version.
>
> Thanks!
>
> > >
> > > And so to architecture... This all makes me wonder why we still
> > > special case the memcg LRU lists here.
> >
> > Can't agree more.
> > The first time I read this code, I wondered why not assign an
> > non-negtive number to kmemcg_id of the root_mem_cgroup and then use
> > memcg_lrus as well.
>
> Yeah, I've been wondering why the ID is -1 instead of 0 when we
> have a global variable that stores the root memcg that we can
> compare pointers directly against via is_root_memcg(). all it seems
> to do is make things more complex by having to special case the root
> memcg....
>
> From that perspective, I do like your change to use the memcg
> iterator functions rather than a for loop over the range of indexes,
> but I'd much prefer to see this method used consistently everywhere
> rather than the way we've duplicated lots of code by tacking memcgs
> onto the side of the non-memcg code paths.
>

Agreed, that would be better.
I will think about it.

> > > Ever since we went to
> > > per-node memcg lru lists (~2015, iirc), there's been this special
> > > hidden hack for the root memcg to use the global list. and one that
> > > I have to read lots of code to remind myself it exists every time I
> > > have to did into this code again.
> > >
> > > I mean, if the list is not memcg aware, then it only needs a single
> > > list per node - the root memcg list. That could be easily stored in
> > > the memcg_lrus array for the node rather than a separate global
> > > list, and then the node iteration code all starts to look like this:
> > >
> > >         nlru = &lru->node[nid];
> > >         for_each_mem_cgroup(memcg) {
> > >                 spin_lock(&nlru->lock);
> > >                 isolated += __list_lru_walk_one(nlru, memcg,
> > >                                                 isolate, cb_arg,
> > >                                                 nr_to_walk);
> > >                 spin_unlock(&nlru->lock);
> > >                 if (*nr_to_walk <= 0)
> > >                         break;
> > >
> > >                 /* non-memcg aware lists only have a root memcg list */
> > >                 if (!list_lru_memcg_aware(lru))
> > >                         break;
> > >         }
> > >
> > > Hence for the walks in the !list_lru_memcg_aware(lru) case, the
> > > list_lru_from_memcg() call in __list_lru_walk_one() always returns
> > > just the root list. This makes everything use the same iteration
> > > interface, and when you configure out memcg then we simply code the
> > > the iterator to run once and list_lru_from_memcg() always returns
> > > the one list...
> > >
> >
> > Agree with you that it is a better architecture, and I also want to
> > change it like this.
> > That would be more clear and easier for maintiance.
> > But it requires lots of code changes, should we address it in another
> > separate patchset ?
>
> Yes. I think this is a separate piece of work as it spans much more
> than just the list-lru infrastructure.
>

Got it.

> > > And, FWIW, I noticed that the list_lru memcg code assumes we only
> > > ever put objects from memcg associated slab pages in the list_lru.
> > > It calls memcg_from_slab_page() which makes no attempt to verify the
> > > page is actually a slab page. That's a landmine just waiting to get
> > > boom - list-lru can store any type of object the user wants, not
> > > just slab pages. e.g. the binder code stores pages in the list-lru,
> > > not slab objects, and so the only reason it doesn't go boom is that
> > > the lru-list is not configured to be memcg aware....
> > >
> >
> > Another new issue.
> > I will try to dignose what hiddened in this landmine is, and after I
> > understand it clearly I will submit a new patchset.
>
> The problem is memcg_from_slab_page() uses page->slab_cache directly
> to retreive the owner memcg without first checking the
> PageSlab(page) flag. If it's not a slab page, we need to get the
> memcg from page->memcg, not page->slab_cache->memcg_params.memcg.
>
> see page_cgroup_ino() for how to safely get the owner memcg from a
> random page of unknown type...
>

Understood, many thanks for your explanation.


Thanks
Yafang

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1a315c7..f36ada9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -449,6 +449,21 @@  struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 int mem_cgroup_scan_tasks(struct mem_cgroup *,
 			  int (*)(struct task_struct *, void *), void *);
 
+/*
+ * Iteration constructs for visiting all cgroups (under a tree).  If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root)		\
+	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter)			\
+	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(NULL, iter, NULL))
+
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
 	if (mem_cgroup_disabled())
@@ -949,6 +964,12 @@  static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	return 0;
 }
 
+#define for_each_mem_cgroup_tree(iter)		\
+	for (iter = NULL; iter; )
+
+#define for_each_mem_cgroup(iter)		\
+	for (iter = NULL; iter; )
+
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
 	return 0;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 0f1f6b0..536830d 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -207,11 +207,11 @@  unsigned long list_lru_count_node(struct list_lru *lru, int nid)
 EXPORT_SYMBOL_GPL(list_lru_count_node);
 
 static unsigned long
-__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
+__list_lru_walk_one(struct list_lru_node *nlru, struct mem_cgroup *memcg,
 		    list_lru_walk_cb isolate, void *cb_arg,
 		    unsigned long *nr_to_walk)
 {
-
+	int memcg_idx = memcg_cache_id(memcg);
 	struct list_lru_one *l;
 	struct list_head *item, *n;
 	unsigned long isolated = 0;
@@ -273,7 +273,7 @@  unsigned long list_lru_count_node(struct list_lru *lru, int nid)
 	unsigned long ret;
 
 	spin_lock(&nlru->lock);
-	ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+	ret = __list_lru_walk_one(nlru, memcg, isolate, cb_arg,
 				  nr_to_walk);
 	spin_unlock(&nlru->lock);
 	return ret;
@@ -289,7 +289,7 @@  unsigned long list_lru_count_node(struct list_lru *lru, int nid)
 	unsigned long ret;
 
 	spin_lock_irq(&nlru->lock);
-	ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+	ret = __list_lru_walk_one(nlru, memcg, isolate, cb_arg,
 				  nr_to_walk);
 	spin_unlock_irq(&nlru->lock);
 	return ret;
@@ -299,17 +299,15 @@  unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 				 list_lru_walk_cb isolate, void *cb_arg,
 				 unsigned long *nr_to_walk)
 {
+	struct mem_cgroup *memcg;
 	long isolated = 0;
-	int memcg_idx;
 
-	isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
-				      nr_to_walk);
-	if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
-		for_each_memcg_cache_index(memcg_idx) {
+	if (list_lru_memcg_aware(lru)) {
+		for_each_mem_cgroup(memcg) {
 			struct list_lru_node *nlru = &lru->node[nid];
 
 			spin_lock(&nlru->lock);
-			isolated += __list_lru_walk_one(nlru, memcg_idx,
+			isolated += __list_lru_walk_one(nlru, memcg,
 							isolate, cb_arg,
 							nr_to_walk);
 			spin_unlock(&nlru->lock);
@@ -317,7 +315,11 @@  unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 			if (*nr_to_walk <= 0)
 				break;
 		}
+	} else {
+		isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
+					      nr_to_walk);
 	}
+
 	return isolated;
 }
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2e78931..2fc2bf4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -222,21 +222,6 @@  enum res_type {
 /* Used for OOM nofiier */
 #define OOM_CONTROL		(0)
 
-/*
- * Iteration constructs for visiting all cgroups (under a tree).  If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root)		\
-	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
-	     iter != NULL;				\
-	     iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter)			\
-	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
-	     iter != NULL;				\
-	     iter = mem_cgroup_iter(NULL, iter, NULL))
-
 static inline bool should_force_charge(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||