diff mbox series

[v2] mm: memcontrol: fix use after free in mem_cgroup_iter()

Message ID 20190726021247.16162-1-miles.chen@mediatek.com (mailing list archive)
State New, archived
Headers show
Series [v2] mm: memcontrol: fix use after free in mem_cgroup_iter() | expand

Commit Message

Miles Chen July 26, 2019, 2:12 a.m. UTC
This patch is sent to report an use after free in mem_cgroup_iter()
after merging commit: be2657752e9e "mm: memcg: fix use after free in
mem_cgroup_iter()".

I work with android kernel tree (4.9 & 4.14), and the commit:
be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
been merged to the trees. However, I can still observe use after free
issues addressed in the commit be2657752e9e.
(on low-end devices, a few times this month)

backtrace:
	css_tryget <- crash here
	mem_cgroup_iter
	shrink_node
	shrink_zones
	do_try_to_free_pages
	try_to_free_pages
	__perform_reclaim
	__alloc_pages_direct_reclaim
	__alloc_pages_slowpath
	__alloc_pages_nodemask

To debug, I poisoned mem_cgroup before freeing it:

static void __mem_cgroup_free(struct mem_cgroup *memcg)
	for_each_node(node)
	free_mem_cgroup_per_node_info(memcg, node);
	free_percpu(memcg->stat);
+       /* poison memcg before freeing it */
+       memset(memcg, 0x78, sizeof(struct mem_cgroup));
	kfree(memcg);
}

The coredump shows the position=0xdbbc2a00 is freed.

(gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
$13 = {position = 0xdbbc2a00, generation = 0x2efd}

0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878

In the reclaim path, try_to_free_pages() does not setup
sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
shrink_node().

In mem_cgroup_iter(), root is set to root_mem_cgroup because
sc->target_mem_cgroup is NULL.
It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
mem_cgroup_iter().

	try_to_free_pages
		struct scan_control sc = {...}, target_mem_cgroup is 0x0;
	do_try_to_free_pages
	shrink_zones
	shrink_node
		 mem_cgroup *root = sc->target_mem_cgroup;
		 memcg = mem_cgroup_iter(root, NULL, &reclaim);
	mem_cgroup_iter()
		if (!root)
			root = root_mem_cgroup;
		...

		css = css_next_descendant_pre(css, &root->css);
		memcg = mem_cgroup_from_css(css);
		cmpxchg(&iter->position, pos, memcg);

My device uses memcg non-hierarchical mode.
When we release a memcg: invalidate_reclaim_iterators() reaches only
dead_memcg and its parents. If non-hierarchical mode is used,
invalidate_reclaim_iterators() never reaches root_mem_cgroup.

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;

	for (; memcg; memcg = parent_mem_cgroup(memcg)
	...
}

So the use after free scenario looks like:

CPU1						CPU2

try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
    if (!root)
    	root = root_mem_cgroup;
    ...
    css = css_next_descendant_pre(css, &root->css);
    memcg = mem_cgroup_from_css(css);
    cmpxchg(&iter->position, pos, memcg);

					invalidate_reclaim_iterators(memcg);
					...
					__mem_cgroup_free()
						kfree(memcg);

try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
    if (!root)
    	root = root_mem_cgroup;
    ...
    mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
    iter = &mz->iter[reclaim->priority];
    pos = READ_ONCE(iter->position);
    css_tryget(&pos->css) <- use after free

To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter in
invalidate_reclaim_iterators().

Change since v1:
Add a comment to explain why we need to handle root_mem_cgroup separately.
Rename invalid_root to invalidate_root.

Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Miles Chen <miles.chen@mediatek.com>
---
 mm/memcontrol.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

Comments

Miles Chen July 26, 2019, 10:20 a.m. UTC | #1
Hi Johannes,

I post patch v2 with proper comment you mentioned.
(I am not sure if I can copy your acked-by to patch v2 directly)

Miles

On Fri, 2019-07-26 at 10:12 +0800, Miles Chen wrote:
> This patch is sent to report an use after free in mem_cgroup_iter()
> after merging commit: be2657752e9e "mm: memcg: fix use after free in
> mem_cgroup_iter()".
> 
> I work with android kernel tree (4.9 & 4.14), and the commit:
> be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
> been merged to the trees. However, I can still observe use after free
> issues addressed in the commit be2657752e9e.
> (on low-end devices, a few times this month)
> 
> backtrace:
> 	css_tryget <- crash here
> 	mem_cgroup_iter
> 	shrink_node
> 	shrink_zones
> 	do_try_to_free_pages
> 	try_to_free_pages
> 	__perform_reclaim
> 	__alloc_pages_direct_reclaim
> 	__alloc_pages_slowpath
> 	__alloc_pages_nodemask
> 
> To debug, I poisoned mem_cgroup before freeing it:
> 
> static void __mem_cgroup_free(struct mem_cgroup *memcg)
> 	for_each_node(node)
> 	free_mem_cgroup_per_node_info(memcg, node);
> 	free_percpu(memcg->stat);
> +       /* poison memcg before freeing it */
> +       memset(memcg, 0x78, sizeof(struct mem_cgroup));
> 	kfree(memcg);
> }
> 
> The coredump shows the position=0xdbbc2a00 is freed.
> 
> (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
> $13 = {position = 0xdbbc2a00, generation = 0x2efd}
> 
> 0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
> 0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
> 0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
> 0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
> 0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
> 0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
> 0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
> 0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
> 0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
> 0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
> 0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
> 0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878
> 
> In the reclaim path, try_to_free_pages() does not setup
> sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
> shrink_node().
> 
> In mem_cgroup_iter(), root is set to root_mem_cgroup because
> sc->target_mem_cgroup is NULL.
> It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
> mem_cgroup_iter().
> 
> 	try_to_free_pages
> 		struct scan_control sc = {...}, target_mem_cgroup is 0x0;
> 	do_try_to_free_pages
> 	shrink_zones
> 	shrink_node
> 		 mem_cgroup *root = sc->target_mem_cgroup;
> 		 memcg = mem_cgroup_iter(root, NULL, &reclaim);
> 	mem_cgroup_iter()
> 		if (!root)
> 			root = root_mem_cgroup;
> 		...
> 
> 		css = css_next_descendant_pre(css, &root->css);
> 		memcg = mem_cgroup_from_css(css);
> 		cmpxchg(&iter->position, pos, memcg);
> 
> My device uses memcg non-hierarchical mode.
> When we release a memcg: invalidate_reclaim_iterators() reaches only
> dead_memcg and its parents. If non-hierarchical mode is used,
> invalidate_reclaim_iterators() never reaches root_mem_cgroup.
> 
> static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> {
> 	struct mem_cgroup *memcg = dead_memcg;
> 
> 	for (; memcg; memcg = parent_mem_cgroup(memcg)
> 	...
> }
> 
> So the use after free scenario looks like:
> 
> CPU1						CPU2
> 
> try_to_free_pages
> do_try_to_free_pages
> shrink_zones
> shrink_node
> mem_cgroup_iter()
>     if (!root)
>     	root = root_mem_cgroup;
>     ...
>     css = css_next_descendant_pre(css, &root->css);
>     memcg = mem_cgroup_from_css(css);
>     cmpxchg(&iter->position, pos, memcg);
> 
> 					invalidate_reclaim_iterators(memcg);
> 					...
> 					__mem_cgroup_free()
> 						kfree(memcg);
> 
> try_to_free_pages
> do_try_to_free_pages
> shrink_zones
> shrink_node
> mem_cgroup_iter()
>     if (!root)
>     	root = root_mem_cgroup;
>     ...
>     mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
>     iter = &mz->iter[reclaim->priority];
>     pos = READ_ONCE(iter->position);
>     css_tryget(&pos->css) <- use after free
> 
> To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter in
> invalidate_reclaim_iterators().
> 
> Change since v1:
> Add a comment to explain why we need to handle root_mem_cgroup separately.
> Rename invalid_root to invalidate_root.
> 
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Miles Chen <miles.chen@mediatek.com>
> ---
>  mm/memcontrol.c | 38 ++++++++++++++++++++++++++++----------
>  1 file changed, 28 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index cdbb7a84cb6e..09f2191f113b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1130,26 +1130,44 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
>  		css_put(&prev->css);
>  }
>  
> -static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> +static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
> +					struct mem_cgroup *dead_memcg)
>  {
> -	struct mem_cgroup *memcg = dead_memcg;
>  	struct mem_cgroup_reclaim_iter *iter;
>  	struct mem_cgroup_per_node *mz;
>  	int nid;
>  	int i;
>  
> -	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
> -		for_each_node(nid) {
> -			mz = mem_cgroup_nodeinfo(memcg, nid);
> -			for (i = 0; i <= DEF_PRIORITY; i++) {
> -				iter = &mz->iter[i];
> -				cmpxchg(&iter->position,
> -					dead_memcg, NULL);
> -			}
> +	for_each_node(nid) {
> +		mz = mem_cgroup_nodeinfo(from, nid);
> +		for (i = 0; i <= DEF_PRIORITY; i++) {
> +			iter = &mz->iter[i];
> +			cmpxchg(&iter->position,
> +				dead_memcg, NULL);
>  		}
>  	}
>  }
>  
> +/*
> + * When cgruop1 non-hierarchy mode is used, parent_mem_cgroup() does
> + * not walk all the way up to the cgroup root (root_mem_cgroup). So
> + * we have to handle dead_memcg from cgroup root separately.
> + */
> +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> +{
> +	struct mem_cgroup *memcg = dead_memcg;
> +	int invalidate_root = 0;
> +
> +	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
> +		__invalidate_reclaim_iterators(memcg, dead_memcg);
> +		if (memcg == root_mem_cgroup)
> +			invalidate_root = 1;
> +	}
> +
> +	if (!invalidate_root)
> +		__invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg);
> +}
> +
>  /**
>   * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
>   * @memcg: hierarchy root
Michal Hocko July 26, 2019, 12:49 p.m. UTC | #2
On Fri 26-07-19 10:12:47, Miles Chen wrote:
> This patch is sent to report an use after free in mem_cgroup_iter()
> after merging commit: be2657752e9e "mm: memcg: fix use after free in
> mem_cgroup_iter()".
> 
> I work with android kernel tree (4.9 & 4.14), and the commit:
> be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
> been merged to the trees. However, I can still observe use after free
> issues addressed in the commit be2657752e9e.
> (on low-end devices, a few times this month)
> 
> backtrace:
> 	css_tryget <- crash here
> 	mem_cgroup_iter
> 	shrink_node
> 	shrink_zones
> 	do_try_to_free_pages
> 	try_to_free_pages
> 	__perform_reclaim
> 	__alloc_pages_direct_reclaim
> 	__alloc_pages_slowpath
> 	__alloc_pages_nodemask
> 
> To debug, I poisoned mem_cgroup before freeing it:
> 
> static void __mem_cgroup_free(struct mem_cgroup *memcg)
> 	for_each_node(node)
> 	free_mem_cgroup_per_node_info(memcg, node);
> 	free_percpu(memcg->stat);
> +       /* poison memcg before freeing it */
> +       memset(memcg, 0x78, sizeof(struct mem_cgroup));
> 	kfree(memcg);
> }
> 
> The coredump shows the position=0xdbbc2a00 is freed.
> 
> (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
> $13 = {position = 0xdbbc2a00, generation = 0x2efd}
> 
> 0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
> 0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
> 0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
> 0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
> 0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
> 0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
> 0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
> 0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
> 0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
> 0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
> 0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
> 0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878
> 
> In the reclaim path, try_to_free_pages() does not setup
> sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
> shrink_node().
> 
> In mem_cgroup_iter(), root is set to root_mem_cgroup because
> sc->target_mem_cgroup is NULL.
> It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
> mem_cgroup_iter().
> 
> 	try_to_free_pages
> 		struct scan_control sc = {...}, target_mem_cgroup is 0x0;
> 	do_try_to_free_pages
> 	shrink_zones
> 	shrink_node
> 		 mem_cgroup *root = sc->target_mem_cgroup;
> 		 memcg = mem_cgroup_iter(root, NULL, &reclaim);
> 	mem_cgroup_iter()
> 		if (!root)
> 			root = root_mem_cgroup;
> 		...
> 
> 		css = css_next_descendant_pre(css, &root->css);
> 		memcg = mem_cgroup_from_css(css);
> 		cmpxchg(&iter->position, pos, memcg);
> 
> My device uses memcg non-hierarchical mode.
> When we release a memcg: invalidate_reclaim_iterators() reaches only
> dead_memcg and its parents. If non-hierarchical mode is used,
> invalidate_reclaim_iterators() never reaches root_mem_cgroup.
> 
> static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> {
> 	struct mem_cgroup *memcg = dead_memcg;
> 
> 	for (; memcg; memcg = parent_mem_cgroup(memcg)
> 	...
> }
> 
> So the use after free scenario looks like:
> 
> CPU1						CPU2
> 
> try_to_free_pages
> do_try_to_free_pages
> shrink_zones
> shrink_node
> mem_cgroup_iter()
>     if (!root)
>     	root = root_mem_cgroup;
>     ...
>     css = css_next_descendant_pre(css, &root->css);
>     memcg = mem_cgroup_from_css(css);
>     cmpxchg(&iter->position, pos, memcg);
> 
> 					invalidate_reclaim_iterators(memcg);
> 					...
> 					__mem_cgroup_free()
> 						kfree(memcg);
> 
> try_to_free_pages
> do_try_to_free_pages
> shrink_zones
> shrink_node
> mem_cgroup_iter()
>     if (!root)
>     	root = root_mem_cgroup;
>     ...
>     mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
>     iter = &mz->iter[reclaim->priority];
>     pos = READ_ONCE(iter->position);
>     css_tryget(&pos->css) <- use after free

Thanks for the write up. This is really useful.

> To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter in
> invalidate_reclaim_iterators().

I am sorry, I didn't get to comment an earlier version but I am
wondering whether it makes more sense to do and explicit invalidation.

[...]
> +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> +{
> +	struct mem_cgroup *memcg = dead_memcg;
> +	int invalidate_root = 0;
> +
> +	for (; memcg; memcg = parent_mem_cgroup(memcg))
> +		__invalidate_reclaim_iterators(memcg, dead_memcg);

	/* here goes your comment */
	if (!dead_memcg->use_hierarchy)
		__invalidate_reclaim_iterators(root_mem_cgroup,	dead_memcg);
> +
> +}

Other than that the patch looks good to me.

Acked-by: Michal Hocko <mhocko@suse.com>
Michal Hocko July 26, 2019, 12:55 p.m. UTC | #3
On Fri 26-07-19 14:49:33, Michal Hocko wrote:
> On Fri 26-07-19 10:12:47, Miles Chen wrote:
> > This patch is sent to report an use after free in mem_cgroup_iter()
> > after merging commit: be2657752e9e "mm: memcg: fix use after free in
> > mem_cgroup_iter()".
> > 
> > I work with android kernel tree (4.9 & 4.14), and the commit:
> > be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
> > been merged to the trees. However, I can still observe use after free
> > issues addressed in the commit be2657752e9e.
> > (on low-end devices, a few times this month)
> > 
> > backtrace:
> > 	css_tryget <- crash here
> > 	mem_cgroup_iter
> > 	shrink_node
> > 	shrink_zones
> > 	do_try_to_free_pages
> > 	try_to_free_pages
> > 	__perform_reclaim
> > 	__alloc_pages_direct_reclaim
> > 	__alloc_pages_slowpath
> > 	__alloc_pages_nodemask
> > 
> > To debug, I poisoned mem_cgroup before freeing it:
> > 
> > static void __mem_cgroup_free(struct mem_cgroup *memcg)
> > 	for_each_node(node)
> > 	free_mem_cgroup_per_node_info(memcg, node);
> > 	free_percpu(memcg->stat);
> > +       /* poison memcg before freeing it */
> > +       memset(memcg, 0x78, sizeof(struct mem_cgroup));
> > 	kfree(memcg);
> > }
> > 
> > The coredump shows the position=0xdbbc2a00 is freed.
> > 
> > (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
> > $13 = {position = 0xdbbc2a00, generation = 0x2efd}
> > 
> > 0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
> > 0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
> > 0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
> > 0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
> > 0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
> > 0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
> > 0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
> > 0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
> > 0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
> > 0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
> > 0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
> > 0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
> > 0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
> > 0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
> > 0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
> > 0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878
> > 
> > In the reclaim path, try_to_free_pages() does not setup
> > sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
> > shrink_node().
> > 
> > In mem_cgroup_iter(), root is set to root_mem_cgroup because
> > sc->target_mem_cgroup is NULL.
> > It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
> > mem_cgroup_iter().
> > 
> > 	try_to_free_pages
> > 		struct scan_control sc = {...}, target_mem_cgroup is 0x0;
> > 	do_try_to_free_pages
> > 	shrink_zones
> > 	shrink_node
> > 		 mem_cgroup *root = sc->target_mem_cgroup;
> > 		 memcg = mem_cgroup_iter(root, NULL, &reclaim);
> > 	mem_cgroup_iter()
> > 		if (!root)
> > 			root = root_mem_cgroup;
> > 		...
> > 
> > 		css = css_next_descendant_pre(css, &root->css);
> > 		memcg = mem_cgroup_from_css(css);
> > 		cmpxchg(&iter->position, pos, memcg);
> > 
> > My device uses memcg non-hierarchical mode.
> > When we release a memcg: invalidate_reclaim_iterators() reaches only
> > dead_memcg and its parents. If non-hierarchical mode is used,
> > invalidate_reclaim_iterators() never reaches root_mem_cgroup.
> > 
> > static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > {
> > 	struct mem_cgroup *memcg = dead_memcg;
> > 
> > 	for (; memcg; memcg = parent_mem_cgroup(memcg)
> > 	...
> > }
> > 
> > So the use after free scenario looks like:
> > 
> > CPU1						CPU2
> > 
> > try_to_free_pages
> > do_try_to_free_pages
> > shrink_zones
> > shrink_node
> > mem_cgroup_iter()
> >     if (!root)
> >     	root = root_mem_cgroup;
> >     ...
> >     css = css_next_descendant_pre(css, &root->css);
> >     memcg = mem_cgroup_from_css(css);
> >     cmpxchg(&iter->position, pos, memcg);
> > 
> > 					invalidate_reclaim_iterators(memcg);
> > 					...
> > 					__mem_cgroup_free()
> > 						kfree(memcg);
> > 
> > try_to_free_pages
> > do_try_to_free_pages
> > shrink_zones
> > shrink_node
> > mem_cgroup_iter()
> >     if (!root)
> >     	root = root_mem_cgroup;
> >     ...
> >     mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
> >     iter = &mz->iter[reclaim->priority];
> >     pos = READ_ONCE(iter->position);
> >     css_tryget(&pos->css) <- use after free
> 
> Thanks for the write up. This is really useful.
> 
> > To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter in
> > invalidate_reclaim_iterators().
> 
> I am sorry, I didn't get to comment an earlier version but I am
> wondering whether it makes more sense to do and explicit invalidation.
> 
> [...]
> > +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > +{
> > +	struct mem_cgroup *memcg = dead_memcg;
> > +	int invalidate_root = 0;
> > +
> > +	for (; memcg; memcg = parent_mem_cgroup(memcg))
> > +		__invalidate_reclaim_iterators(memcg, dead_memcg);
> 
> 	/* here goes your comment */
> 	if (!dead_memcg->use_hierarchy)
> 		__invalidate_reclaim_iterators(root_mem_cgroup,	dead_memcg);
> > +
> > +}
> 
> Other than that the patch looks good to me.
> 
> Acked-by: Michal Hocko <mhocko@suse.com>

Btw. I believe we want to push this to stable trees as well. I think it
goes all the way down to 5ac8fb31ad2e ("mm: memcontrol: convert reclaim
iterator to simple css refcounting"). Unless I am missing something a
Fixes: tag would be really helpful.
Miles Chen July 26, 2019, 11:47 p.m. UTC | #4
On Fri, 2019-07-26 at 14:55 +0200, Michal Hocko wrote:
> On Fri 26-07-19 14:49:33, Michal Hocko wrote:
> > On Fri 26-07-19 10:12:47, Miles Chen wrote:
> > > This patch is sent to report an use after free in mem_cgroup_iter()
> > > after merging commit: be2657752e9e "mm: memcg: fix use after free in
> > > mem_cgroup_iter()".
> > > 
> > > I work with android kernel tree (4.9 & 4.14), and the commit:
> > > be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
> > > been merged to the trees. However, I can still observe use after free
> > > issues addressed in the commit be2657752e9e.
> > > (on low-end devices, a few times this month)
> > > 
> > > backtrace:
> > > 	css_tryget <- crash here
> > > 	mem_cgroup_iter
> > > 	shrink_node
> > > 	shrink_zones
> > > 	do_try_to_free_pages
> > > 	try_to_free_pages
> > > 	__perform_reclaim
> > > 	__alloc_pages_direct_reclaim
> > > 	__alloc_pages_slowpath
> > > 	__alloc_pages_nodemask
> > > 
> > > To debug, I poisoned mem_cgroup before freeing it:
> > > 
> > > static void __mem_cgroup_free(struct mem_cgroup *memcg)
> > > 	for_each_node(node)
> > > 	free_mem_cgroup_per_node_info(memcg, node);
> > > 	free_percpu(memcg->stat);
> > > +       /* poison memcg before freeing it */
> > > +       memset(memcg, 0x78, sizeof(struct mem_cgroup));
> > > 	kfree(memcg);
> > > }
> > > 
> > > The coredump shows the position=0xdbbc2a00 is freed.
> > > 
> > > (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
> > > $13 = {position = 0xdbbc2a00, generation = 0x2efd}
> > > 
> > > 0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
> > > 0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
> > > 0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
> > > 0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
> > > 0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
> > > 0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
> > > 0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
> > > 0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
> > > 0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
> > > 0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
> > > 0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
> > > 0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
> > > 0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
> > > 0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
> > > 0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878
> > > 
> > > In the reclaim path, try_to_free_pages() does not setup
> > > sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
> > > shrink_node().
> > > 
> > > In mem_cgroup_iter(), root is set to root_mem_cgroup because
> > > sc->target_mem_cgroup is NULL.
> > > It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
> > > mem_cgroup_iter().
> > > 
> > > 	try_to_free_pages
> > > 		struct scan_control sc = {...}, target_mem_cgroup is 0x0;
> > > 	do_try_to_free_pages
> > > 	shrink_zones
> > > 	shrink_node
> > > 		 mem_cgroup *root = sc->target_mem_cgroup;
> > > 		 memcg = mem_cgroup_iter(root, NULL, &reclaim);
> > > 	mem_cgroup_iter()
> > > 		if (!root)
> > > 			root = root_mem_cgroup;
> > > 		...
> > > 
> > > 		css = css_next_descendant_pre(css, &root->css);
> > > 		memcg = mem_cgroup_from_css(css);
> > > 		cmpxchg(&iter->position, pos, memcg);
> > > 
> > > My device uses memcg non-hierarchical mode.
> > > When we release a memcg: invalidate_reclaim_iterators() reaches only
> > > dead_memcg and its parents. If non-hierarchical mode is used,
> > > invalidate_reclaim_iterators() never reaches root_mem_cgroup.
> > > 
> > > static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > > {
> > > 	struct mem_cgroup *memcg = dead_memcg;
> > > 
> > > 	for (; memcg; memcg = parent_mem_cgroup(memcg)
> > > 	...
> > > }
> > > 
> > > So the use after free scenario looks like:
> > > 
> > > CPU1						CPU2
> > > 
> > > try_to_free_pages
> > > do_try_to_free_pages
> > > shrink_zones
> > > shrink_node
> > > mem_cgroup_iter()
> > >     if (!root)
> > >     	root = root_mem_cgroup;
> > >     ...
> > >     css = css_next_descendant_pre(css, &root->css);
> > >     memcg = mem_cgroup_from_css(css);
> > >     cmpxchg(&iter->position, pos, memcg);
> > > 
> > > 					invalidate_reclaim_iterators(memcg);
> > > 					...
> > > 					__mem_cgroup_free()
> > > 						kfree(memcg);
> > > 
> > > try_to_free_pages
> > > do_try_to_free_pages
> > > shrink_zones
> > > shrink_node
> > > mem_cgroup_iter()
> > >     if (!root)
> > >     	root = root_mem_cgroup;
> > >     ...
> > >     mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
> > >     iter = &mz->iter[reclaim->priority];
> > >     pos = READ_ONCE(iter->position);
> > >     css_tryget(&pos->css) <- use after free
> > 
> > Thanks for the write up. This is really useful.
> > 
> > > To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter in
> > > invalidate_reclaim_iterators().
> > 
> > I am sorry, I didn't get to comment an earlier version but I am
> > wondering whether it makes more sense to do and explicit invalidation.
> > 

I think we should keep the original v2 version, the reason is the 
!use_hierarchy does not imply we can reach root_mem_cgroup:

cd /sys/fs/cgroup/memory/0
mkdir 1
cd /sys/fs/cgroup/memory/0/1
echo 1 > memory.use_hierarchy // only 1 and its children has
use_hierarchy set
mkdir 2

rmdir 2 // parent_mem_cgroup(2) goes up to 1

> > [...]
> > > +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > > +{
> > > +	struct mem_cgroup *memcg = dead_memcg;
> > > +	int invalidate_root = 0;
> > > +
> > > +	for (; memcg; memcg = parent_mem_cgroup(memcg))
> > > +		__invalidate_reclaim_iterators(memcg, dead_memcg);
> > 
> > 	/* here goes your comment */
> > 	if (!dead_memcg->use_hierarchy)
> > 		__invalidate_reclaim_iterators(root_mem_cgroup,	dead_memcg);
> > > +
> > > +}
> > 
> > Other than that the patch looks good to me.
> > 
> > Acked-by: Michal Hocko <mhocko@suse.com>
> 
> Btw. I believe we want to push this to stable trees as well. I think it
> goes all the way down to 5ac8fb31ad2e ("mm: memcontrol: convert reclaim
> iterator to simple css refcounting"). Unless I am missing something a
> Fixes: tag would be really helpful.

No problem. I'll add the fix tag to patch v3.
Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple
css refcounting")


Miles
Miles Chen July 27, 2019, 6 a.m. UTC | #5
On Sat, 2019-07-27 at 07:47 +0800, Miles Chen wrote:
> On Fri, 2019-07-26 at 14:55 +0200, Michal Hocko wrote:
> > On Fri 26-07-19 14:49:33, Michal Hocko wrote:
> > > On Fri 26-07-19 10:12:47, Miles Chen wrote:
> > > > This patch is sent to report an use after free in mem_cgroup_iter()
> > > > after merging commit: be2657752e9e "mm: memcg: fix use after free in
> > > > mem_cgroup_iter()".
> > > > 
> > > > I work with android kernel tree (4.9 & 4.14), and the commit:
> > > > be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
> > > > been merged to the trees. However, I can still observe use after free
> > > > issues addressed in the commit be2657752e9e.
> > > > (on low-end devices, a few times this month)
> > > > 
> > > > backtrace:
> > > > 	css_tryget <- crash here
> > > > 	mem_cgroup_iter
> > > > 	shrink_node
> > > > 	shrink_zones
> > > > 	do_try_to_free_pages
> > > > 	try_to_free_pages
> > > > 	__perform_reclaim
> > > > 	__alloc_pages_direct_reclaim
> > > > 	__alloc_pages_slowpath
> > > > 	__alloc_pages_nodemask
> > > > 
> > > > To debug, I poisoned mem_cgroup before freeing it:
> > > > 
> > > > static void __mem_cgroup_free(struct mem_cgroup *memcg)
> > > > 	for_each_node(node)
> > > > 	free_mem_cgroup_per_node_info(memcg, node);
> > > > 	free_percpu(memcg->stat);
> > > > +       /* poison memcg before freeing it */
> > > > +       memset(memcg, 0x78, sizeof(struct mem_cgroup));
> > > > 	kfree(memcg);
> > > > }
> > > > 
> > > > The coredump shows the position=0xdbbc2a00 is freed.
> > > > 
> > > > (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
> > > > $13 = {position = 0xdbbc2a00, generation = 0x2efd}
> > > > 
> > > > 0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
> > > > 0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
> > > > 0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
> > > > 0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
> > > > 0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
> > > > 0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
> > > > 0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
> > > > 0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
> > > > 0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
> > > > 0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
> > > > 0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
> > > > 0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
> > > > 0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
> > > > 0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
> > > > 0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878
> > > > 
> > > > In the reclaim path, try_to_free_pages() does not setup
> > > > sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
> > > > shrink_node().
> > > > 
> > > > In mem_cgroup_iter(), root is set to root_mem_cgroup because
> > > > sc->target_mem_cgroup is NULL.
> > > > It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
> > > > mem_cgroup_iter().
> > > > 
> > > > 	try_to_free_pages
> > > > 		struct scan_control sc = {...}, target_mem_cgroup is 0x0;
> > > > 	do_try_to_free_pages
> > > > 	shrink_zones
> > > > 	shrink_node
> > > > 		 mem_cgroup *root = sc->target_mem_cgroup;
> > > > 		 memcg = mem_cgroup_iter(root, NULL, &reclaim);
> > > > 	mem_cgroup_iter()
> > > > 		if (!root)
> > > > 			root = root_mem_cgroup;
> > > > 		...
> > > > 
> > > > 		css = css_next_descendant_pre(css, &root->css);
> > > > 		memcg = mem_cgroup_from_css(css);
> > > > 		cmpxchg(&iter->position, pos, memcg);
> > > > 
> > > > My device uses memcg non-hierarchical mode.
> > > > When we release a memcg: invalidate_reclaim_iterators() reaches only
> > > > dead_memcg and its parents. If non-hierarchical mode is used,
> > > > invalidate_reclaim_iterators() never reaches root_mem_cgroup.
> > > > 
> > > > static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > > > {
> > > > 	struct mem_cgroup *memcg = dead_memcg;
> > > > 
> > > > 	for (; memcg; memcg = parent_mem_cgroup(memcg)
> > > > 	...
> > > > }
> > > > 
> > > > So the use after free scenario looks like:
> > > > 
> > > > CPU1						CPU2
> > > > 
> > > > try_to_free_pages
> > > > do_try_to_free_pages
> > > > shrink_zones
> > > > shrink_node
> > > > mem_cgroup_iter()
> > > >     if (!root)
> > > >     	root = root_mem_cgroup;
> > > >     ...
> > > >     css = css_next_descendant_pre(css, &root->css);
> > > >     memcg = mem_cgroup_from_css(css);
> > > >     cmpxchg(&iter->position, pos, memcg);
> > > > 
> > > > 					invalidate_reclaim_iterators(memcg);
> > > > 					...
> > > > 					__mem_cgroup_free()
> > > > 						kfree(memcg);
> > > > 
> > > > try_to_free_pages
> > > > do_try_to_free_pages
> > > > shrink_zones
> > > > shrink_node
> > > > mem_cgroup_iter()
> > > >     if (!root)
> > > >     	root = root_mem_cgroup;
> > > >     ...
> > > >     mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
> > > >     iter = &mz->iter[reclaim->priority];
> > > >     pos = READ_ONCE(iter->position);
> > > >     css_tryget(&pos->css) <- use after free
> > > 
> > > Thanks for the write up. This is really useful.
> > > 
> > > > To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter in
> > > > invalidate_reclaim_iterators().
> > > 
> > > I am sorry, I didn't get to comment an earlier version but I am
> > > wondering whether it makes more sense to do and explicit invalidation.
> > > 
> 
> I think we should keep the original v2 version, the reason is the 
> !use_hierarchy does not imply we can reach root_mem_cgroup:

Sorry I want to correct myself:

(dead_memcg->use_hierarchy == true) does not guarantee that we can
reach root_mem_cgroup by parent_mem_cgroup(dead_memcg)

> cd /sys/fs/cgroup/memory/0
> mkdir 1
> cd /sys/fs/cgroup/memory/0/1
> echo 1 > memory.use_hierarchy // only 1 and its children has
> use_hierarchy set
> mkdir 2
> 
> rmdir 2 // parent_mem_cgroup(2) goes up to 1
> 
> > > [...]
> > > > +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > > > +{
> > > > +	struct mem_cgroup *memcg = dead_memcg;
> > > > +	int invalidate_root = 0;
> > > > +
> > > > +	for (; memcg; memcg = parent_mem_cgroup(memcg))
> > > > +		__invalidate_reclaim_iterators(memcg, dead_memcg);
> > > 
> > > 	/* here goes your comment */
> > > 	if (!dead_memcg->use_hierarchy)
> > > 		__invalidate_reclaim_iterators(root_mem_cgroup,	dead_memcg);
> > > > +
> > > > +}
> > > 
> > > Other than that the patch looks good to me.
> > > 
> > > Acked-by: Michal Hocko <mhocko@suse.com>
> > 
> > Btw. I believe we want to push this to stable trees as well. I think it
> > goes all the way down to 5ac8fb31ad2e ("mm: memcontrol: convert reclaim
> > iterator to simple css refcounting"). Unless I am missing something a
> > Fixes: tag would be really helpful.
> 
> No problem. I'll add the fix tag to patch v3.
> Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple
> css refcounting")
> 
> 
> Miles
> 
>
Michal Hocko July 29, 2019, 6:25 a.m. UTC | #6
On Sat 27-07-19 07:47:58, Miles Chen wrote:
> On Fri, 2019-07-26 at 14:55 +0200, Michal Hocko wrote:
[...]
> > > I am sorry, I didn't get to comment an earlier version but I am
> > > wondering whether it makes more sense to do and explicit invalidation.
> > > 
> 
> I think we should keep the original v2 version, the reason is the 
> !use_hierarchy does not imply we can reach root_mem_cgroup:
> 
> cd /sys/fs/cgroup/memory/0
> mkdir 1
> cd /sys/fs/cgroup/memory/0/1
> echo 1 > memory.use_hierarchy // only 1 and its children has
> use_hierarchy set
> mkdir 2
> 
> rmdir 2 // parent_mem_cgroup(2) goes up to 1

You are right I have missed this case. I am not sure anybody is using
layout like that but your fix is more robust and catches that case as
well.

Acked-by: Michal Hocko <mhocko@suse.com>
Johannes Weiner July 29, 2019, 4:06 p.m. UTC | #7
On Fri, Jul 26, 2019 at 10:12:47AM +0800, Miles Chen wrote:
> This patch is sent to report an use after free in mem_cgroup_iter()
> after merging commit: be2657752e9e "mm: memcg: fix use after free in
> mem_cgroup_iter()".
> 
> I work with android kernel tree (4.9 & 4.14), and the commit:
> be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
> been merged to the trees. However, I can still observe use after free
> issues addressed in the commit be2657752e9e.
> (on low-end devices, a few times this month)
> 
> backtrace:
> 	css_tryget <- crash here
> 	mem_cgroup_iter
> 	shrink_node
> 	shrink_zones
> 	do_try_to_free_pages
> 	try_to_free_pages
> 	__perform_reclaim
> 	__alloc_pages_direct_reclaim
> 	__alloc_pages_slowpath
> 	__alloc_pages_nodemask
> 
> To debug, I poisoned mem_cgroup before freeing it:
> 
> static void __mem_cgroup_free(struct mem_cgroup *memcg)
> 	for_each_node(node)
> 	free_mem_cgroup_per_node_info(memcg, node);
> 	free_percpu(memcg->stat);
> +       /* poison memcg before freeing it */
> +       memset(memcg, 0x78, sizeof(struct mem_cgroup));
> 	kfree(memcg);
> }
> 
> The coredump shows the position=0xdbbc2a00 is freed.
> 
> (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
> $13 = {position = 0xdbbc2a00, generation = 0x2efd}
> 
> 0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
> 0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
> 0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
> 0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
> 0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
> 0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
> 0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
> 0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
> 0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
> 0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
> 0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
> 0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
> 0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
> 0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878
> 
> In the reclaim path, try_to_free_pages() does not setup
> sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
> shrink_node().
> 
> In mem_cgroup_iter(), root is set to root_mem_cgroup because
> sc->target_mem_cgroup is NULL.
> It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
> mem_cgroup_iter().
> 
> 	try_to_free_pages
> 		struct scan_control sc = {...}, target_mem_cgroup is 0x0;
> 	do_try_to_free_pages
> 	shrink_zones
> 	shrink_node
> 		 mem_cgroup *root = sc->target_mem_cgroup;
> 		 memcg = mem_cgroup_iter(root, NULL, &reclaim);
> 	mem_cgroup_iter()
> 		if (!root)
> 			root = root_mem_cgroup;
> 		...
> 
> 		css = css_next_descendant_pre(css, &root->css);
> 		memcg = mem_cgroup_from_css(css);
> 		cmpxchg(&iter->position, pos, memcg);
> 
> My device uses memcg non-hierarchical mode.
> When we release a memcg: invalidate_reclaim_iterators() reaches only
> dead_memcg and its parents. If non-hierarchical mode is used,
> invalidate_reclaim_iterators() never reaches root_mem_cgroup.
> 
> static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> {
> 	struct mem_cgroup *memcg = dead_memcg;
> 
> 	for (; memcg; memcg = parent_mem_cgroup(memcg)
> 	...
> }
> 
> So the use after free scenario looks like:
> 
> CPU1						CPU2
> 
> try_to_free_pages
> do_try_to_free_pages
> shrink_zones
> shrink_node
> mem_cgroup_iter()
>     if (!root)
>     	root = root_mem_cgroup;
>     ...
>     css = css_next_descendant_pre(css, &root->css);
>     memcg = mem_cgroup_from_css(css);
>     cmpxchg(&iter->position, pos, memcg);
> 
> 					invalidate_reclaim_iterators(memcg);
> 					...
> 					__mem_cgroup_free()
> 						kfree(memcg);
> 
> try_to_free_pages
> do_try_to_free_pages
> shrink_zones
> shrink_node
> mem_cgroup_iter()
>     if (!root)
>     	root = root_mem_cgroup;
>     ...
>     mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
>     iter = &mz->iter[reclaim->priority];
>     pos = READ_ONCE(iter->position);
>     css_tryget(&pos->css) <- use after free
> 
> To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter in
> invalidate_reclaim_iterators().
> 
> Change since v1:
> Add a comment to explain why we need to handle root_mem_cgroup separately.
> Rename invalid_root to invalidate_root.
> 
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Miles Chen <miles.chen@mediatek.com>
> ---
>  mm/memcontrol.c | 38 ++++++++++++++++++++++++++++----------
>  1 file changed, 28 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index cdbb7a84cb6e..09f2191f113b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1130,26 +1130,44 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
>  		css_put(&prev->css);
>  }
>  
> -static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> +static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
> +					struct mem_cgroup *dead_memcg)
>  {
> -	struct mem_cgroup *memcg = dead_memcg;
>  	struct mem_cgroup_reclaim_iter *iter;
>  	struct mem_cgroup_per_node *mz;
>  	int nid;
>  	int i;
>  
> -	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
> -		for_each_node(nid) {
> -			mz = mem_cgroup_nodeinfo(memcg, nid);
> -			for (i = 0; i <= DEF_PRIORITY; i++) {
> -				iter = &mz->iter[i];
> -				cmpxchg(&iter->position,
> -					dead_memcg, NULL);
> -			}
> +	for_each_node(nid) {
> +		mz = mem_cgroup_nodeinfo(from, nid);
> +		for (i = 0; i <= DEF_PRIORITY; i++) {
> +			iter = &mz->iter[i];
> +			cmpxchg(&iter->position,
> +				dead_memcg, NULL);
>  		}
>  	}
>  }
>  
> +/*
> + * When cgruop1 non-hierarchy mode is used, parent_mem_cgroup() does
> + * not walk all the way up to the cgroup root (root_mem_cgroup). So
> + * we have to handle dead_memcg from cgroup root separately.
> + */
> +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> +{
> +	struct mem_cgroup *memcg = dead_memcg;
> +	int invalidate_root = 0;
> +
> +	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
> +		__invalidate_reclaim_iterators(memcg, dead_memcg);
> +		if (memcg == root_mem_cgroup)
> +			invalidate_root = 1;
> +	}
> +
> +	if (!invalidate_root)
> +		__invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg);

"invalidate_root" suggests we still have to invalidate the root, but
the variable works the opposite way. How about dropping it altogether
and moving the comment directly to where the decision is made:

	struct mem_cgroup *memcg = dead_memcg;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgruop1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (last != root_mem_cgroup)
		__invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg);
Miles Chen July 30, 2019, 1:48 a.m. UTC | #8
On Mon, 2019-07-29 at 12:06 -0400, Johannes Weiner wrote:
> On Fri, Jul 26, 2019 at 10:12:47AM +0800, Miles Chen wrote:
> > This patch is sent to report an use after free in mem_cgroup_iter()
> > after merging commit: be2657752e9e "mm: memcg: fix use after free in
> > mem_cgroup_iter()".
> > 
> > I work with android kernel tree (4.9 & 4.14), and the commit:
> > be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
> > been merged to the trees. However, I can still observe use after free
> > issues addressed in the commit be2657752e9e.
> > (on low-end devices, a few times this month)
> > 
> > backtrace:
> > 	css_tryget <- crash here
> > 	mem_cgroup_iter
> > 	shrink_node
> > 	shrink_zones
> > 	do_try_to_free_pages
> > 	try_to_free_pages
> > 	__perform_reclaim
> > 	__alloc_pages_direct_reclaim
> > 	__alloc_pages_slowpath
> > 	__alloc_pages_nodemask
> > 
> > To debug, I poisoned mem_cgroup before freeing it:
> > 
> > static void __mem_cgroup_free(struct mem_cgroup *memcg)
> > 	for_each_node(node)
> > 	free_mem_cgroup_per_node_info(memcg, node);
> > 	free_percpu(memcg->stat);
> > +       /* poison memcg before freeing it */
> > +       memset(memcg, 0x78, sizeof(struct mem_cgroup));
> > 	kfree(memcg);
> > }
> > 
> > The coredump shows the position=0xdbbc2a00 is freed.
> > 
> > (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
> > $13 = {position = 0xdbbc2a00, generation = 0x2efd}
> > 
> > 0xdbbc2a00:     0xdbbc2e00      0x00000000      0xdbbc2800      0x00000100
> > 0xdbbc2a10:     0x00000200      0x78787878      0x00026218      0x00000000
> > 0xdbbc2a20:     0xdcad6000      0x00000001      0x78787800      0x00000000
> > 0xdbbc2a30:     0x78780000      0x00000000      0x0068fb84      0x78787878
> > 0xdbbc2a40:     0x78787878      0x78787878      0x78787878      0xe3fa5cc0
> > 0xdbbc2a50:     0x78787878      0x78787878      0x00000000      0x00000000
> > 0xdbbc2a60:     0x00000000      0x00000000      0x00000000      0x00000000
> > 0xdbbc2a70:     0x00000000      0x00000000      0x00000000      0x00000000
> > 0xdbbc2a80:     0x00000000      0x00000000      0x00000000      0x00000000
> > 0xdbbc2a90:     0x00000001      0x00000000      0x00000000      0x00100000
> > 0xdbbc2aa0:     0x00000001      0xdbbc2ac8      0x00000000      0x00000000
> > 0xdbbc2ab0:     0x00000000      0x00000000      0x00000000      0x00000000
> > 0xdbbc2ac0:     0x00000000      0x00000000      0xe5b02618      0x00001000
> > 0xdbbc2ad0:     0x00000000      0x78787878      0x78787878      0x78787878
> > 0xdbbc2ae0:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2af0:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b00:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b10:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b20:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b30:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b40:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b50:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b60:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b70:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2b80:     0x78787878      0x78787878      0x00000000      0x78787878
> > 0xdbbc2b90:     0x78787878      0x78787878      0x78787878      0x78787878
> > 0xdbbc2ba0:     0x78787878      0x78787878      0x78787878      0x78787878
> > 
> > In the reclaim path, try_to_free_pages() does not setup
> > sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
> > shrink_node().
> > 
> > In mem_cgroup_iter(), root is set to root_mem_cgroup because
> > sc->target_mem_cgroup is NULL.
> > It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
> > mem_cgroup_iter().
> > 
> > 	try_to_free_pages
> > 		struct scan_control sc = {...}, target_mem_cgroup is 0x0;
> > 	do_try_to_free_pages
> > 	shrink_zones
> > 	shrink_node
> > 		 mem_cgroup *root = sc->target_mem_cgroup;
> > 		 memcg = mem_cgroup_iter(root, NULL, &reclaim);
> > 	mem_cgroup_iter()
> > 		if (!root)
> > 			root = root_mem_cgroup;
> > 		...
> > 
> > 		css = css_next_descendant_pre(css, &root->css);
> > 		memcg = mem_cgroup_from_css(css);
> > 		cmpxchg(&iter->position, pos, memcg);
> > 
> > My device uses memcg non-hierarchical mode.
> > When we release a memcg: invalidate_reclaim_iterators() reaches only
> > dead_memcg and its parents. If non-hierarchical mode is used,
> > invalidate_reclaim_iterators() never reaches root_mem_cgroup.
> > 
> > static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > {
> > 	struct mem_cgroup *memcg = dead_memcg;
> > 
> > 	for (; memcg; memcg = parent_mem_cgroup(memcg)
> > 	...
> > }
> > 
> > So the use after free scenario looks like:
> > 
> > CPU1						CPU2
> > 
> > try_to_free_pages
> > do_try_to_free_pages
> > shrink_zones
> > shrink_node
> > mem_cgroup_iter()
> >     if (!root)
> >     	root = root_mem_cgroup;
> >     ...
> >     css = css_next_descendant_pre(css, &root->css);
> >     memcg = mem_cgroup_from_css(css);
> >     cmpxchg(&iter->position, pos, memcg);
> > 
> > 					invalidate_reclaim_iterators(memcg);
> > 					...
> > 					__mem_cgroup_free()
> > 						kfree(memcg);
> > 
> > try_to_free_pages
> > do_try_to_free_pages
> > shrink_zones
> > shrink_node
> > mem_cgroup_iter()
> >     if (!root)
> >     	root = root_mem_cgroup;
> >     ...
> >     mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
> >     iter = &mz->iter[reclaim->priority];
> >     pos = READ_ONCE(iter->position);
> >     css_tryget(&pos->css) <- use after free
> > 
> > To avoid this, we should also invalidate root_mem_cgroup.nodeinfo.iter in
> > invalidate_reclaim_iterators().
> > 
> > Change since v1:
> > Add a comment to explain why we need to handle root_mem_cgroup separately.
> > Rename invalid_root to invalidate_root.
> > 
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Signed-off-by: Miles Chen <miles.chen@mediatek.com>
> > ---
> >  mm/memcontrol.c | 38 ++++++++++++++++++++++++++++----------
> >  1 file changed, 28 insertions(+), 10 deletions(-)
> > 
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index cdbb7a84cb6e..09f2191f113b 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1130,26 +1130,44 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
> >  		css_put(&prev->css);
> >  }
> >  
> > -static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > +static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
> > +					struct mem_cgroup *dead_memcg)
> >  {
> > -	struct mem_cgroup *memcg = dead_memcg;
> >  	struct mem_cgroup_reclaim_iter *iter;
> >  	struct mem_cgroup_per_node *mz;
> >  	int nid;
> >  	int i;
> >  
> > -	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
> > -		for_each_node(nid) {
> > -			mz = mem_cgroup_nodeinfo(memcg, nid);
> > -			for (i = 0; i <= DEF_PRIORITY; i++) {
> > -				iter = &mz->iter[i];
> > -				cmpxchg(&iter->position,
> > -					dead_memcg, NULL);
> > -			}
> > +	for_each_node(nid) {
> > +		mz = mem_cgroup_nodeinfo(from, nid);
> > +		for (i = 0; i <= DEF_PRIORITY; i++) {
> > +			iter = &mz->iter[i];
> > +			cmpxchg(&iter->position,
> > +				dead_memcg, NULL);
> >  		}
> >  	}
> >  }
> >  
> > +/*
> > + * When cgruop1 non-hierarchy mode is used, parent_mem_cgroup() does
> > + * not walk all the way up to the cgroup root (root_mem_cgroup). So
> > + * we have to handle dead_memcg from cgroup root separately.
> > + */
> > +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> > +{
> > +	struct mem_cgroup *memcg = dead_memcg;
> > +	int invalidate_root = 0;
> > +
> > +	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
> > +		__invalidate_reclaim_iterators(memcg, dead_memcg);
> > +		if (memcg == root_mem_cgroup)
> > +			invalidate_root = 1;
> > +	}
> > +
> > +	if (!invalidate_root)
> > +		__invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg);
> 
> "invalidate_root" suggests we still have to invalidate the root, but
> the variable works the opposite way. How about dropping it altogether
> and moving the comment directly to where the decision is made:
> 
> 	struct mem_cgroup *memcg = dead_memcg;
> 
> 	do {
> 		__invalidate_reclaim_iterators(memcg, dead_memcg);
> 		last = memcg;
> 	} while ((memcg = parent_mem_cgroup(memcg)));
> 
> 	/*
> 	 * When cgruop1 non-hierarchy mode is used,
> 	 * parent_mem_cgroup() does not walk all the way up to the
> 	 * cgroup root (root_mem_cgroup). So we have to handle
> 	 * dead_memcg from cgroup root separately.
> 	 */
> 	if (last != root_mem_cgroup)
> 		__invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg);

Thanks for the suggestion, the code is easier to read this way.
I'll submit patch v4 with this and the fixed tags.


Miles
diff mbox series

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cdbb7a84cb6e..09f2191f113b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1130,26 +1130,44 @@  void mem_cgroup_iter_break(struct mem_cgroup *root,
 		css_put(&prev->css);
 }
 
-static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
+					struct mem_cgroup *dead_memcg)
 {
-	struct mem_cgroup *memcg = dead_memcg;
 	struct mem_cgroup_reclaim_iter *iter;
 	struct mem_cgroup_per_node *mz;
 	int nid;
 	int i;
 
-	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		for_each_node(nid) {
-			mz = mem_cgroup_nodeinfo(memcg, nid);
-			for (i = 0; i <= DEF_PRIORITY; i++) {
-				iter = &mz->iter[i];
-				cmpxchg(&iter->position,
-					dead_memcg, NULL);
-			}
+	for_each_node(nid) {
+		mz = mem_cgroup_nodeinfo(from, nid);
+		for (i = 0; i <= DEF_PRIORITY; i++) {
+			iter = &mz->iter[i];
+			cmpxchg(&iter->position,
+				dead_memcg, NULL);
 		}
 	}
 }
 
+/*
+ * When cgruop1 non-hierarchy mode is used, parent_mem_cgroup() does
+ * not walk all the way up to the cgroup root (root_mem_cgroup). So
+ * we have to handle dead_memcg from cgroup root separately.
+ */
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+	struct mem_cgroup *memcg = dead_memcg;
+	int invalidate_root = 0;
+
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		__invalidate_reclaim_iterators(memcg, dead_memcg);
+		if (memcg == root_mem_cgroup)
+			invalidate_root = 1;
+	}
+
+	if (!invalidate_root)
+		__invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg);
+}
+
 /**
  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
  * @memcg: hierarchy root