diff mbox series

[RFC,1/2] mm: rework memcg kernel stack accounting

Message ID 20180815003620.15678-1-guro@fb.com (mailing list archive)
State New, archived
Headers show
Series [RFC,1/2] mm: rework memcg kernel stack accounting | expand

Commit Message

Roman Gushchin Aug. 15, 2018, 12:36 a.m. UTC
If CONFIG_VMAP_STACK is set, kernel stacks are allocated
using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel
stack pages are charged against corresponding memory cgroups
on allocation and uncharged on releasing them.

The problem is that we do cache kernel stacks in small
per-cpu caches and do reuse them for new tasks, which can
belong to different memory cgroups.

Each stack page still holds a reference to the original cgroup,
so the cgroup can't be released until the vmap area is released.

To make this happen we need more than two subsequent exits
without forks in between on the current cpu, which makes it
very unlikely to happen. As a result, I saw a significant number
of dying cgroups (in theory, up to 2 * number_of_cpu +
number_of_tasks), which can't be released even by significant
memory pressure.

As a cgroup structure can take a significant amount of memory
(first of all, per-cpu data like memcg statistics), it leads
to a noticeable waste of memory.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
---
 kernel/fork.c | 44 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

Comments

Shakeel Butt Aug. 15, 2018, 1:18 a.m. UTC | #1
On Tue, Aug 14, 2018 at 5:37 PM Roman Gushchin <guro@fb.com> wrote:
>
> If CONFIG_VMAP_STACK is set, kernel stacks are allocated
> using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel
> stack pages are charged against corresponding memory cgroups
> on allocation and uncharged on releasing them.
>
> The problem is that we do cache kernel stacks in small
> per-cpu caches and do reuse them for new tasks, which can
> belong to different memory cgroups.
>
> Each stack page still holds a reference to the original cgroup,
> so the cgroup can't be released until the vmap area is released.
>
> To make this happen we need more than two subsequent exits
> without forks in between on the current cpu, which makes it
> very unlikely to happen. As a result, I saw a significant number
> of dying cgroups (in theory, up to 2 * number_of_cpu +
> number_of_tasks), which can't be released even by significant
> memory pressure.
>
> As a cgroup structure can take a significant amount of memory
> (first of all, per-cpu data like memcg statistics), it leads
> to a noticeable waste of memory.
>
> Signed-off-by: Roman Gushchin <guro@fb.com>

I was also looking into this issue. I was thinking of having a
per-memcg per-cpu stack cache. However this solution seems much
simpler. Can you also add the performance number for a similar simple
benchmark done in ac496bf48d97 ("fork: Optimize task creation by
caching two thread stacks per CPU if CONFIG_VMAP_STACK=y").

Reviewed-by: Shakeel Butt <shakeelb@google.com>

> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Konstantin Khlebnikov <koct9i@gmail.com>
> Cc: Tejun Heo <tj@kernel.org>
> ---
>  kernel/fork.c | 44 ++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 38 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 69b6fea5a181..91872b2b37bd 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>                 return s->addr;
>         }
>
> +       /*
> +        * Allocated stacks are cached and later reused by new threads,
> +        * so memcg accounting is performed manually on assigning/releasing
> +        * stacks to tasks. Drop __GFP_ACCOUNT.
> +        */
>         stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>                                      VMALLOC_START, VMALLOC_END,
> -                                    THREADINFO_GFP,
> +                                    THREADINFO_GFP & ~__GFP_ACCOUNT,
>                                      PAGE_KERNEL,
>                                      0, node, __builtin_return_address(0));
>
> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  #endif
>  }
>
> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> +{
> +#ifdef CONFIG_VMAP_STACK
> +       struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +       if (vm) {
> +               int i;
> +
> +               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> +                       memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> +                                         compound_order(vm->pages[i]));
> +
> +               /* All stack pages belong to the same memcg. */
> +               mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +                                    THREAD_SIZE / 1024);
> +       }
> +#endif
> +}
> +
>  static inline void free_thread_stack(struct task_struct *tsk)
>  {
>  #ifdef CONFIG_VMAP_STACK
> -       if (task_stack_vm_area(tsk)) {
> +       struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +       if (vm) {
>                 int i;
>
> +               /* All stack pages belong to the same memcg. */
> +               mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +                                    -(int)(THREAD_SIZE / 1024));
> +
> +               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> +                       memcg_kmem_uncharge(vm->pages[i],
> +                                         compound_order(vm->pages[i]));
> +
>                 for (i = 0; i < NR_CACHED_STACKS; i++) {
>                         if (this_cpu_cmpxchg(cached_stacks[i],
>                                         NULL, tsk->stack_vm_area) != NULL)
> @@ -352,10 +386,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
>                                             NR_KERNEL_STACK_KB,
>                                             PAGE_SIZE / 1024 * account);
>                 }
> -
> -               /* All stack pages belong to the same memcg. */
> -               mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> -                                    account * (THREAD_SIZE / 1024));
>         } else {
>                 /*
>                  * All stack pages are in the same zone and belong to the
> @@ -809,6 +839,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>         if (!stack)
>                 goto free_tsk;
>
> +       memcg_charge_kernel_stack(tsk);
> +
>         stack_vm_area = task_stack_vm_area(tsk);
>
>         err = arch_dup_task_struct(tsk, orig);
> --
> 2.14.4
>
Michal Hocko Aug. 15, 2018, 7:10 a.m. UTC | #2
On Tue 14-08-18 17:36:19, Roman Gushchin wrote:
> If CONFIG_VMAP_STACK is set, kernel stacks are allocated
> using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel
> stack pages are charged against corresponding memory cgroups
> on allocation and uncharged on releasing them.
> 
> The problem is that we do cache kernel stacks in small
> per-cpu caches and do reuse them for new tasks, which can
> belong to different memory cgroups.
> 
> Each stack page still holds a reference to the original cgroup,
> so the cgroup can't be released until the vmap area is released.
> 
> To make this happen we need more than two subsequent exits
> without forks in between on the current cpu, which makes it
> very unlikely to happen. As a result, I saw a significant number
> of dying cgroups (in theory, up to 2 * number_of_cpu +
> number_of_tasks), which can't be released even by significant
> memory pressure.
> 
> As a cgroup structure can take a significant amount of memory
> (first of all, per-cpu data like memcg statistics), it leads
> to a noticeable waste of memory.
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>

Fixes: ac496bf48d97 ("fork: Optimize task creation by caching two thread stacks per CPU if CONFIG_VMAP_STACK=y")
AFAICS

> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Konstantin Khlebnikov <koct9i@gmail.com>
> Cc: Tejun Heo <tj@kernel.org>

Yes this is the right way to do accounting here.
Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> ---
>  kernel/fork.c | 44 ++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 38 insertions(+), 6 deletions(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 69b6fea5a181..91872b2b37bd 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  		return s->addr;
>  	}
>  
> +	/*
> +	 * Allocated stacks are cached and later reused by new threads,
> +	 * so memcg accounting is performed manually on assigning/releasing
> +	 * stacks to tasks. Drop __GFP_ACCOUNT.
> +	 */
>  	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>  				     VMALLOC_START, VMALLOC_END,
> -				     THREADINFO_GFP,
> +				     THREADINFO_GFP & ~__GFP_ACCOUNT,
>  				     PAGE_KERNEL,
>  				     0, node, __builtin_return_address(0));
>  
> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  #endif
>  }
>  
> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> +{
> +#ifdef CONFIG_VMAP_STACK
> +	struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +	if (vm) {
> +		int i;
> +
> +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> +			memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> +					  compound_order(vm->pages[i]));
> +
> +		/* All stack pages belong to the same memcg. */
> +		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +				     THREAD_SIZE / 1024);
> +	}
> +#endif
> +}
> +
>  static inline void free_thread_stack(struct task_struct *tsk)
>  {
>  #ifdef CONFIG_VMAP_STACK
> -	if (task_stack_vm_area(tsk)) {
> +	struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +	if (vm) {
>  		int i;
>  
> +		/* All stack pages belong to the same memcg. */
> +		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +				     -(int)(THREAD_SIZE / 1024));
> +
> +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> +			memcg_kmem_uncharge(vm->pages[i],
> +					  compound_order(vm->pages[i]));
> +
>  		for (i = 0; i < NR_CACHED_STACKS; i++) {
>  			if (this_cpu_cmpxchg(cached_stacks[i],
>  					NULL, tsk->stack_vm_area) != NULL)
> @@ -352,10 +386,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
>  					    NR_KERNEL_STACK_KB,
>  					    PAGE_SIZE / 1024 * account);
>  		}
> -
> -		/* All stack pages belong to the same memcg. */
> -		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> -				     account * (THREAD_SIZE / 1024));
>  	} else {
>  		/*
>  		 * All stack pages are in the same zone and belong to the
> @@ -809,6 +839,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  	if (!stack)
>  		goto free_tsk;
>  
> +	memcg_charge_kernel_stack(tsk);
> +
>  	stack_vm_area = task_stack_vm_area(tsk);
>  
>  	err = arch_dup_task_struct(tsk, orig);
> -- 
> 2.14.4
Johannes Weiner Aug. 15, 2018, 4:39 p.m. UTC | #3
On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  		return s->addr;
>  	}
>  
> +	/*
> +	 * Allocated stacks are cached and later reused by new threads,
> +	 * so memcg accounting is performed manually on assigning/releasing
> +	 * stacks to tasks. Drop __GFP_ACCOUNT.
> +	 */
>  	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>  				     VMALLOC_START, VMALLOC_END,
> -				     THREADINFO_GFP,
> +				     THREADINFO_GFP & ~__GFP_ACCOUNT,
>  				     PAGE_KERNEL,
>  				     0, node, __builtin_return_address(0));
>  
> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  #endif
>  }
>  
> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> +{
> +#ifdef CONFIG_VMAP_STACK
> +	struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +	if (vm) {
> +		int i;
> +
> +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> +			memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> +					  compound_order(vm->pages[i]));
> +
> +		/* All stack pages belong to the same memcg. */
> +		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +				     THREAD_SIZE / 1024);
> +	}
> +#endif
> +}

Before this change, the memory limit can fail the fork, but afterwards
fork() can grow memory consumption unimpeded by the cgroup settings.

Can we continue to use try_charge() here and fail the fork?
Roman Gushchin Aug. 15, 2018, 4:55 p.m. UTC | #4
On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> > @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >  		return s->addr;
> >  	}
> >  
> > +	/*
> > +	 * Allocated stacks are cached and later reused by new threads,
> > +	 * so memcg accounting is performed manually on assigning/releasing
> > +	 * stacks to tasks. Drop __GFP_ACCOUNT.
> > +	 */
> >  	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> >  				     VMALLOC_START, VMALLOC_END,
> > -				     THREADINFO_GFP,
> > +				     THREADINFO_GFP & ~__GFP_ACCOUNT,
> >  				     PAGE_KERNEL,
> >  				     0, node, __builtin_return_address(0));
> >  
> > @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >  #endif
> >  }
> >  
> > +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> > +{
> > +#ifdef CONFIG_VMAP_STACK
> > +	struct vm_struct *vm = task_stack_vm_area(tsk);
> > +
> > +	if (vm) {
> > +		int i;
> > +
> > +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > +			memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> > +					  compound_order(vm->pages[i]));
> > +
> > +		/* All stack pages belong to the same memcg. */
> > +		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > +				     THREAD_SIZE / 1024);
> > +	}
> > +#endif
> > +}
> 
> Before this change, the memory limit can fail the fork, but afterwards
> fork() can grow memory consumption unimpeded by the cgroup settings.
> 
> Can we continue to use try_charge() here and fail the fork?

We can, but I'm not convinced we should.

Kernel stack is relatively small, and it's already allocated at this point.
So IMO exceeding the memcg limit for 1-2 pages isn't worse than
adding complexity and handle this case (e.g. uncharge partially
charged stack). Do you have an example, when it does matter?
Andy Lutomirski Aug. 15, 2018, 5:12 p.m. UTC | #5
> On Aug 15, 2018, at 9:55 AM, Roman Gushchin <guro@fb.com> wrote:
> 
>> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
>>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
>>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>>>        return s->addr;
>>>    }
>>> 
>>> +    /*
>>> +     * Allocated stacks are cached and later reused by new threads,
>>> +     * so memcg accounting is performed manually on assigning/releasing
>>> +     * stacks to tasks. Drop __GFP_ACCOUNT.
>>> +     */
>>>    stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>>>                     VMALLOC_START, VMALLOC_END,
>>> -                     THREADINFO_GFP,
>>> +                     THREADINFO_GFP & ~__GFP_ACCOUNT,
>>>                     PAGE_KERNEL,
>>>                     0, node, __builtin_return_address(0));
>>> 
>>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>>> #endif
>>> }
>>> 
>>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
>>> +{
>>> +#ifdef CONFIG_VMAP_STACK
>>> +    struct vm_struct *vm = task_stack_vm_area(tsk);
>>> +
>>> +    if (vm) {
>>> +        int i;
>>> +
>>> +        for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
>>> +            memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
>>> +                      compound_order(vm->pages[i]));
>>> +
>>> +        /* All stack pages belong to the same memcg. */
>>> +        mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
>>> +                     THREAD_SIZE / 1024);
>>> +    }
>>> +#endif
>>> +}
>> 
>> Before this change, the memory limit can fail the fork, but afterwards
>> fork() can grow memory consumption unimpeded by the cgroup settings.
>> 
>> Can we continue to use try_charge() here and fail the fork?
> 
> We can, but I'm not convinced we should.
> 
> Kernel stack is relatively small, and it's already allocated at this point.
> So IMO exceeding the memcg limit for 1-2 pages isn't worse than
> adding complexity and handle this case (e.g. uncharge partially
> charged stack). Do you have an example, when it does matter?

What bounds it to just a few pages?  Couldn’t there be lots of forks in flight that all hit this path?  It’s unlikely, and there are surely easier DoS vectors, but still.
Roman Gushchin Aug. 15, 2018, 5:16 p.m. UTC | #6
On Tue, Aug 14, 2018 at 06:18:01PM -0700, Shakeel Butt wrote:
> On Tue, Aug 14, 2018 at 5:37 PM Roman Gushchin <guro@fb.com> wrote:
> >
> > If CONFIG_VMAP_STACK is set, kernel stacks are allocated
> > using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel
> > stack pages are charged against corresponding memory cgroups
> > on allocation and uncharged on releasing them.
> >
> > The problem is that we do cache kernel stacks in small
> > per-cpu caches and do reuse them for new tasks, which can
> > belong to different memory cgroups.
> >
> > Each stack page still holds a reference to the original cgroup,
> > so the cgroup can't be released until the vmap area is released.
> >
> > To make this happen we need more than two subsequent exits
> > without forks in between on the current cpu, which makes it
> > very unlikely to happen. As a result, I saw a significant number
> > of dying cgroups (in theory, up to 2 * number_of_cpu +
> > number_of_tasks), which can't be released even by significant
> > memory pressure.
> >
> > As a cgroup structure can take a significant amount of memory
> > (first of all, per-cpu data like memcg statistics), it leads
> > to a noticeable waste of memory.
> >
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> 
> I was also looking into this issue. I was thinking of having a
> per-memcg per-cpu stack cache. However this solution seems much
> simpler.

I also thought about having per-memcg stack cache, but it seems
that caching 2 * n(cpus) * n(cgroups) stacks is an overkill,
and there is nothing memcg-specific in these stacks except
that they are pre-charged.

> Can you also add the performance number for a similar simple
> benchmark done in ac496bf48d97 ("fork: Optimize task creation by
> caching two thread stacks per CPU if CONFIG_VMAP_STACK=y").

Sure, will do in v2.

> 
> Reviewed-by: Shakeel Butt <shakeelb@google.com>

Thanks!

> 
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Michal Hocko <mhocko@kernel.org>
> > Cc: Andy Lutomirski <luto@kernel.org>
> > Cc: Konstantin Khlebnikov <koct9i@gmail.com>
> > Cc: Tejun Heo <tj@kernel.org>
> > ---
> >  kernel/fork.c | 44 ++++++++++++++++++++++++++++++++++++++------
> >  1 file changed, 38 insertions(+), 6 deletions(-)
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 69b6fea5a181..91872b2b37bd 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >                 return s->addr;
> >         }
> >
> > +       /*
> > +        * Allocated stacks are cached and later reused by new threads,
> > +        * so memcg accounting is performed manually on assigning/releasing
> > +        * stacks to tasks. Drop __GFP_ACCOUNT.
> > +        */
> >         stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> >                                      VMALLOC_START, VMALLOC_END,
> > -                                    THREADINFO_GFP,
> > +                                    THREADINFO_GFP & ~__GFP_ACCOUNT,
> >                                      PAGE_KERNEL,
> >                                      0, node, __builtin_return_address(0));
> >
> > @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >  #endif
> >  }
> >
> > +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> > +{
> > +#ifdef CONFIG_VMAP_STACK
> > +       struct vm_struct *vm = task_stack_vm_area(tsk);
> > +
> > +       if (vm) {
> > +               int i;
> > +
> > +               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > +                       memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> > +                                         compound_order(vm->pages[i]));
> > +
> > +               /* All stack pages belong to the same memcg. */
> > +               mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > +                                    THREAD_SIZE / 1024);
> > +       }
> > +#endif
> > +}
> > +
> >  static inline void free_thread_stack(struct task_struct *tsk)
> >  {
> >  #ifdef CONFIG_VMAP_STACK
> > -       if (task_stack_vm_area(tsk)) {
> > +       struct vm_struct *vm = task_stack_vm_area(tsk);
> > +
> > +       if (vm) {
> >                 int i;
> >
> > +               /* All stack pages belong to the same memcg. */
> > +               mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > +                                    -(int)(THREAD_SIZE / 1024));
> > +
> > +               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > +                       memcg_kmem_uncharge(vm->pages[i],
> > +                                         compound_order(vm->pages[i]));
> > +
> >                 for (i = 0; i < NR_CACHED_STACKS; i++) {
> >                         if (this_cpu_cmpxchg(cached_stacks[i],
> >                                         NULL, tsk->stack_vm_area) != NULL)
> > @@ -352,10 +386,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
> >                                             NR_KERNEL_STACK_KB,
> >                                             PAGE_SIZE / 1024 * account);
> >                 }
> > -
> > -               /* All stack pages belong to the same memcg. */
> > -               mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > -                                    account * (THREAD_SIZE / 1024));
> >         } else {
> >                 /*
> >                  * All stack pages are in the same zone and belong to the
> > @@ -809,6 +839,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
> >         if (!stack)
> >                 goto free_tsk;
> >
> > +       memcg_charge_kernel_stack(tsk);
> > +
> >         stack_vm_area = task_stack_vm_area(tsk);
> >
> >         err = arch_dup_task_struct(tsk, orig);
> > --
> > 2.14.4
> >
Johannes Weiner Aug. 15, 2018, 5:20 p.m. UTC | #7
On Wed, Aug 15, 2018 at 09:55:17AM -0700, Roman Gushchin wrote:
> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> > On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> > > @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > >  		return s->addr;
> > >  	}
> > >  
> > > +	/*
> > > +	 * Allocated stacks are cached and later reused by new threads,
> > > +	 * so memcg accounting is performed manually on assigning/releasing
> > > +	 * stacks to tasks. Drop __GFP_ACCOUNT.
> > > +	 */
> > >  	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> > >  				     VMALLOC_START, VMALLOC_END,
> > > -				     THREADINFO_GFP,
> > > +				     THREADINFO_GFP & ~__GFP_ACCOUNT,
> > >  				     PAGE_KERNEL,
> > >  				     0, node, __builtin_return_address(0));
> > >  
> > > @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > >  #endif
> > >  }
> > >  
> > > +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> > > +{
> > > +#ifdef CONFIG_VMAP_STACK
> > > +	struct vm_struct *vm = task_stack_vm_area(tsk);
> > > +
> > > +	if (vm) {
> > > +		int i;
> > > +
> > > +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > > +			memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> > > +					  compound_order(vm->pages[i]));
> > > +
> > > +		/* All stack pages belong to the same memcg. */
> > > +		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > > +				     THREAD_SIZE / 1024);
> > > +	}
> > > +#endif
> > > +}
> > 
> > Before this change, the memory limit can fail the fork, but afterwards
> > fork() can grow memory consumption unimpeded by the cgroup settings.
> > 
> > Can we continue to use try_charge() here and fail the fork?
> 
> We can, but I'm not convinced we should.
> 
> Kernel stack is relatively small, and it's already allocated at this point.
> So IMO exceeding the memcg limit for 1-2 pages isn't worse than
> adding complexity and handle this case (e.g. uncharge partially
> charged stack). Do you have an example, when it does matter?

This is completely backwards.

We respect the limits unless there is a *really* strong reason not
to. The only situations I can think of is during OOM kills to avoid
memory deadlocks and during packet reception for correctness issues
(and because the network stack has its own way to reclaim memory).

Relying on some vague future allocations in the process's lifetime to
fail in order to contain it is crappy and unreliable. And unwinding
the stack allocation isn't too much complexity to warrant breaking the
containment rules here, even if it were several steps. But it looks
like it's nothing more than a 'goto free_stack'.

Please just fix this.
Roman Gushchin Aug. 15, 2018, 5:25 p.m. UTC | #8
On Wed, Aug 15, 2018 at 10:12:42AM -0700, Andy Lutomirski wrote:
> 
> 
> > On Aug 15, 2018, at 9:55 AM, Roman Gushchin <guro@fb.com> wrote:
> > 
> >> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> >>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> >>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >>>        return s->addr;
> >>>    }
> >>> 
> >>> +    /*
> >>> +     * Allocated stacks are cached and later reused by new threads,
> >>> +     * so memcg accounting is performed manually on assigning/releasing
> >>> +     * stacks to tasks. Drop __GFP_ACCOUNT.
> >>> +     */
> >>>    stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> >>>                     VMALLOC_START, VMALLOC_END,
> >>> -                     THREADINFO_GFP,
> >>> +                     THREADINFO_GFP & ~__GFP_ACCOUNT,
> >>>                     PAGE_KERNEL,
> >>>                     0, node, __builtin_return_address(0));
> >>> 
> >>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >>> #endif
> >>> }
> >>> 
> >>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> >>> +{
> >>> +#ifdef CONFIG_VMAP_STACK
> >>> +    struct vm_struct *vm = task_stack_vm_area(tsk);
> >>> +
> >>> +    if (vm) {
> >>> +        int i;
> >>> +
> >>> +        for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> >>> +            memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> >>> +                      compound_order(vm->pages[i]));
> >>> +
> >>> +        /* All stack pages belong to the same memcg. */
> >>> +        mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> >>> +                     THREAD_SIZE / 1024);
> >>> +    }
> >>> +#endif
> >>> +}
> >> 
> >> Before this change, the memory limit can fail the fork, but afterwards
> >> fork() can grow memory consumption unimpeded by the cgroup settings.
> >> 
> >> Can we continue to use try_charge() here and fail the fork?
> > 
> > We can, but I'm not convinced we should.
> > 
> > Kernel stack is relatively small, and it's already allocated at this point.
> > So IMO exceeding the memcg limit for 1-2 pages isn't worse than
> > adding complexity and handle this case (e.g. uncharge partially
> > charged stack). Do you have an example, when it does matter?
> 
> What bounds it to just a few pages?  Couldn’t there be lots of forks in flight that all hit this path?  It’s unlikely, and there are surely easier DoS vectors, but still.

Because any following memcg-aware allocation will fail.
There is also the pid cgroup controlled which can be used to limit the number
of forks.

Anyway, I'm ok to handle the this case and fail fork,
if you think it does matter.
Shakeel Butt Aug. 15, 2018, 5:32 p.m. UTC | #9
On Wed, Aug 15, 2018 at 10:26 AM Roman Gushchin <guro@fb.com> wrote:
>
> On Wed, Aug 15, 2018 at 10:12:42AM -0700, Andy Lutomirski wrote:
> >
> >
> > > On Aug 15, 2018, at 9:55 AM, Roman Gushchin <guro@fb.com> wrote:
> > >
> > >> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> > >>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> > >>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > >>>        return s->addr;
> > >>>    }
> > >>>
> > >>> +    /*
> > >>> +     * Allocated stacks are cached and later reused by new threads,
> > >>> +     * so memcg accounting is performed manually on assigning/releasing
> > >>> +     * stacks to tasks. Drop __GFP_ACCOUNT.
> > >>> +     */
> > >>>    stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> > >>>                     VMALLOC_START, VMALLOC_END,
> > >>> -                     THREADINFO_GFP,
> > >>> +                     THREADINFO_GFP & ~__GFP_ACCOUNT,
> > >>>                     PAGE_KERNEL,
> > >>>                     0, node, __builtin_return_address(0));
> > >>>
> > >>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> > >>> #endif
> > >>> }
> > >>>
> > >>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> > >>> +{
> > >>> +#ifdef CONFIG_VMAP_STACK
> > >>> +    struct vm_struct *vm = task_stack_vm_area(tsk);
> > >>> +
> > >>> +    if (vm) {
> > >>> +        int i;
> > >>> +
> > >>> +        for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> > >>> +            memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> > >>> +                      compound_order(vm->pages[i]));
> > >>> +
> > >>> +        /* All stack pages belong to the same memcg. */
> > >>> +        mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> > >>> +                     THREAD_SIZE / 1024);
> > >>> +    }
> > >>> +#endif
> > >>> +}
> > >>
> > >> Before this change, the memory limit can fail the fork, but afterwards
> > >> fork() can grow memory consumption unimpeded by the cgroup settings.
> > >>
> > >> Can we continue to use try_charge() here and fail the fork?
> > >
> > > We can, but I'm not convinced we should.
> > >
> > > Kernel stack is relatively small, and it's already allocated at this point.
> > > So IMO exceeding the memcg limit for 1-2 pages isn't worse than
> > > adding complexity and handle this case (e.g. uncharge partially
> > > charged stack). Do you have an example, when it does matter?
> >
> > What bounds it to just a few pages?  Couldn’t there be lots of forks in flight that all hit this path?  It’s unlikely, and there are surely easier DoS vectors, but still.
>
> Because any following memcg-aware allocation will fail.
> There is also the pid cgroup controlled which can be used to limit the number
> of forks.
>
> Anyway, I'm ok to handle the this case and fail fork,
> if you think it does matter.

Roman, before adding more changes do benchmark this. Maybe disabling
the stack caching for CONFIG_MEMCG is much cleaner.

Shakeel
Andy Lutomirski Aug. 15, 2018, 5:37 p.m. UTC | #10
> On Aug 15, 2018, at 10:32 AM, Shakeel Butt <shakeelb@google.com> wrote:
> 
>> On Wed, Aug 15, 2018 at 10:26 AM Roman Gushchin <guro@fb.com> wrote:
>> 
>>> On Wed, Aug 15, 2018 at 10:12:42AM -0700, Andy Lutomirski wrote:
>>> 
>>> 
>>>>> On Aug 15, 2018, at 9:55 AM, Roman Gushchin <guro@fb.com> wrote:
>>>>> 
>>>>>> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
>>>>>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
>>>>>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>>>>>>       return s->addr;
>>>>>>   }
>>>>>> 
>>>>>> +    /*
>>>>>> +     * Allocated stacks are cached and later reused by new threads,
>>>>>> +     * so memcg accounting is performed manually on assigning/releasing
>>>>>> +     * stacks to tasks. Drop __GFP_ACCOUNT.
>>>>>> +     */
>>>>>>   stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
>>>>>>                    VMALLOC_START, VMALLOC_END,
>>>>>> -                     THREADINFO_GFP,
>>>>>> +                     THREADINFO_GFP & ~__GFP_ACCOUNT,
>>>>>>                    PAGE_KERNEL,
>>>>>>                    0, node, __builtin_return_address(0));
>>>>>> 
>>>>>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>>>>>> #endif
>>>>>> }
>>>>>> 
>>>>>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
>>>>>> +{
>>>>>> +#ifdef CONFIG_VMAP_STACK
>>>>>> +    struct vm_struct *vm = task_stack_vm_area(tsk);
>>>>>> +
>>>>>> +    if (vm) {
>>>>>> +        int i;
>>>>>> +
>>>>>> +        for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
>>>>>> +            memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
>>>>>> +                      compound_order(vm->pages[i]));
>>>>>> +
>>>>>> +        /* All stack pages belong to the same memcg. */
>>>>>> +        mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
>>>>>> +                     THREAD_SIZE / 1024);
>>>>>> +    }
>>>>>> +#endif
>>>>>> +}
>>>>> 
>>>>> Before this change, the memory limit can fail the fork, but afterwards
>>>>> fork() can grow memory consumption unimpeded by the cgroup settings.
>>>>> 
>>>>> Can we continue to use try_charge() here and fail the fork?
>>>> 
>>>> We can, but I'm not convinced we should.
>>>> 
>>>> Kernel stack is relatively small, and it's already allocated at this point.
>>>> So IMO exceeding the memcg limit for 1-2 pages isn't worse than
>>>> adding complexity and handle this case (e.g. uncharge partially
>>>> charged stack). Do you have an example, when it does matter?
>>> 
>>> What bounds it to just a few pages?  Couldn’t there be lots of forks in flight that all hit this path?  It’s unlikely, and there are surely easier DoS vectors, but still.
>> 
>> Because any following memcg-aware allocation will fail.
>> There is also the pid cgroup controlled which can be used to limit the number
>> of forks.
>> 
>> Anyway, I'm ok to handle the this case and fail fork,
>> if you think it does matter.
> 
> Roman, before adding more changes do benchmark this. Maybe disabling
> the stack caching for CONFIG_MEMCG is much cleaner.
> 
> 

Unless memcg accounting is colossally slow, the caching should be left on. vmalloc() isn’t inherently slow, but vfree() is, since we need to do a global broadcast TLB flush after enough vfree() calls.
Michal Hocko Aug. 16, 2018, 6:35 a.m. UTC | #11
On Wed 15-08-18 13:20:44, Johannes Weiner wrote:
[...]
> This is completely backwards.
> 
> We respect the limits unless there is a *really* strong reason not
> to. The only situations I can think of is during OOM kills to avoid
> memory deadlocks and during packet reception for correctness issues
> (and because the network stack has its own way to reclaim memory).
> 
> Relying on some vague future allocations in the process's lifetime to
> fail in order to contain it is crappy and unreliable. And unwinding
> the stack allocation isn't too much complexity to warrant breaking the
> containment rules here, even if it were several steps. But it looks
> like it's nothing more than a 'goto free_stack'.
> 
> Please just fix this.

Thinking about it some more (sorry I should have done that in my
previous reply already) I do agree with Johannes. We should really back
off as soon as possible rather than rely on a future action because
this is quite subtle and prone to unexpected behavior.
Roman Gushchin Aug. 16, 2018, 3:24 p.m. UTC | #12
On Thu, Aug 16, 2018 at 08:35:09AM +0200, Michal Hocko wrote:
> On Wed 15-08-18 13:20:44, Johannes Weiner wrote:
> [...]
> > This is completely backwards.
> > 
> > We respect the limits unless there is a *really* strong reason not
> > to. The only situations I can think of is during OOM kills to avoid
> > memory deadlocks and during packet reception for correctness issues
> > (and because the network stack has its own way to reclaim memory).
> > 
> > Relying on some vague future allocations in the process's lifetime to
> > fail in order to contain it is crappy and unreliable. And unwinding
> > the stack allocation isn't too much complexity to warrant breaking the
> > containment rules here, even if it were several steps. But it looks
> > like it's nothing more than a 'goto free_stack'.
> > 
> > Please just fix this.
> 
> Thinking about it some more (sorry I should have done that in my
> previous reply already) I do agree with Johannes. We should really back
> off as soon as possible rather than rely on a future action because
> this is quite subtle and prone to unexpected behavior.

Ok, no problems, I'll address this in v2.

Thanks!
Roman Gushchin Aug. 21, 2018, 5:22 p.m. UTC | #13
On Wed, Aug 15, 2018 at 10:37:28AM -0700, Andy Lutomirski wrote:
> 
> 
> > On Aug 15, 2018, at 10:32 AM, Shakeel Butt <shakeelb@google.com> wrote:
> > 
> >> On Wed, Aug 15, 2018 at 10:26 AM Roman Gushchin <guro@fb.com> wrote:
> >> 
> >>> On Wed, Aug 15, 2018 at 10:12:42AM -0700, Andy Lutomirski wrote:
> >>> 
> >>> 
> >>>>> On Aug 15, 2018, at 9:55 AM, Roman Gushchin <guro@fb.com> wrote:
> >>>>> 
> >>>>>> On Wed, Aug 15, 2018 at 12:39:23PM -0400, Johannes Weiner wrote:
> >>>>>> On Tue, Aug 14, 2018 at 05:36:19PM -0700, Roman Gushchin wrote:
> >>>>>> @@ -224,9 +224,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >>>>>>       return s->addr;
> >>>>>>   }
> >>>>>> 
> >>>>>> +    /*
> >>>>>> +     * Allocated stacks are cached and later reused by new threads,
> >>>>>> +     * so memcg accounting is performed manually on assigning/releasing
> >>>>>> +     * stacks to tasks. Drop __GFP_ACCOUNT.
> >>>>>> +     */
> >>>>>>   stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
> >>>>>>                    VMALLOC_START, VMALLOC_END,
> >>>>>> -                     THREADINFO_GFP,
> >>>>>> +                     THREADINFO_GFP & ~__GFP_ACCOUNT,
> >>>>>>                    PAGE_KERNEL,
> >>>>>>                    0, node, __builtin_return_address(0));
> >>>>>> 
> >>>>>> @@ -246,12 +251,41 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
> >>>>>> #endif
> >>>>>> }
> >>>>>> 
> >>>>>> +static void memcg_charge_kernel_stack(struct task_struct *tsk)
> >>>>>> +{
> >>>>>> +#ifdef CONFIG_VMAP_STACK
> >>>>>> +    struct vm_struct *vm = task_stack_vm_area(tsk);
> >>>>>> +
> >>>>>> +    if (vm) {
> >>>>>> +        int i;
> >>>>>> +
> >>>>>> +        for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
> >>>>>> +            memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
> >>>>>> +                      compound_order(vm->pages[i]));
> >>>>>> +
> >>>>>> +        /* All stack pages belong to the same memcg. */
> >>>>>> +        mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> >>>>>> +                     THREAD_SIZE / 1024);
> >>>>>> +    }
> >>>>>> +#endif
> >>>>>> +}
> >>>>> 
> >>>>> Before this change, the memory limit can fail the fork, but afterwards
> >>>>> fork() can grow memory consumption unimpeded by the cgroup settings.
> >>>>> 
> >>>>> Can we continue to use try_charge() here and fail the fork?
> >>>> 
> >>>> We can, but I'm not convinced we should.
> >>>> 
> >>>> Kernel stack is relatively small, and it's already allocated at this point.
> >>>> So IMO exceeding the memcg limit for 1-2 pages isn't worse than
> >>>> adding complexity and handle this case (e.g. uncharge partially
> >>>> charged stack). Do you have an example, when it does matter?
> >>> 
> >>> What bounds it to just a few pages?  Couldn’t there be lots of forks in flight that all hit this path?  It’s unlikely, and there are surely easier DoS vectors, but still.
> >> 
> >> Because any following memcg-aware allocation will fail.
> >> There is also the pid cgroup controlled which can be used to limit the number
> >> of forks.
> >> 
> >> Anyway, I'm ok to handle the this case and fail fork,
> >> if you think it does matter.
> > 
> > Roman, before adding more changes do benchmark this. Maybe disabling
> > the stack caching for CONFIG_MEMCG is much cleaner.
> > 
> > 
> 
> Unless memcg accounting is colossally slow, the caching should be left on. vmalloc() isn’t inherently slow, but vfree() is, since we need to do a global broadcast TLB flush after enough vfree() calls.

It's not.

BTW, is the test, which you used to measure the performance
gains of stack caching, available publicly?

Thanks!
diff mbox series

Patch

diff --git a/kernel/fork.c b/kernel/fork.c
index 69b6fea5a181..91872b2b37bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -224,9 +224,14 @@  static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		return s->addr;
 	}
 
+	/*
+	 * Allocated stacks are cached and later reused by new threads,
+	 * so memcg accounting is performed manually on assigning/releasing
+	 * stacks to tasks. Drop __GFP_ACCOUNT.
+	 */
 	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
 				     VMALLOC_START, VMALLOC_END,
-				     THREADINFO_GFP,
+				     THREADINFO_GFP & ~__GFP_ACCOUNT,
 				     PAGE_KERNEL,
 				     0, node, __builtin_return_address(0));
 
@@ -246,12 +251,41 @@  static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 #endif
 }
 
+static void memcg_charge_kernel_stack(struct task_struct *tsk)
+{
+#ifdef CONFIG_VMAP_STACK
+	struct vm_struct *vm = task_stack_vm_area(tsk);
+
+	if (vm) {
+		int i;
+
+		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+			memcg_kmem_charge(vm->pages[i], __GFP_NOFAIL,
+					  compound_order(vm->pages[i]));
+
+		/* All stack pages belong to the same memcg. */
+		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+				     THREAD_SIZE / 1024);
+	}
+#endif
+}
+
 static inline void free_thread_stack(struct task_struct *tsk)
 {
 #ifdef CONFIG_VMAP_STACK
-	if (task_stack_vm_area(tsk)) {
+	struct vm_struct *vm = task_stack_vm_area(tsk);
+
+	if (vm) {
 		int i;
 
+		/* All stack pages belong to the same memcg. */
+		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+				     -(int)(THREAD_SIZE / 1024));
+
+		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+			memcg_kmem_uncharge(vm->pages[i],
+					  compound_order(vm->pages[i]));
+
 		for (i = 0; i < NR_CACHED_STACKS; i++) {
 			if (this_cpu_cmpxchg(cached_stacks[i],
 					NULL, tsk->stack_vm_area) != NULL)
@@ -352,10 +386,6 @@  static void account_kernel_stack(struct task_struct *tsk, int account)
 					    NR_KERNEL_STACK_KB,
 					    PAGE_SIZE / 1024 * account);
 		}
-
-		/* All stack pages belong to the same memcg. */
-		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
-				     account * (THREAD_SIZE / 1024));
 	} else {
 		/*
 		 * All stack pages are in the same zone and belong to the
@@ -809,6 +839,8 @@  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (!stack)
 		goto free_tsk;
 
+	memcg_charge_kernel_stack(tsk);
+
 	stack_vm_area = task_stack_vm_area(tsk);
 
 	err = arch_dup_task_struct(tsk, orig);