[v2,06/13] fork: Add generic vmalloced stack support

Message ID 44f658aacbabd9d1689b3e0aae60ee8746881eff.1466192946.git.luto@kernel.org (mailing list archive)
State New, archived

Commit Message

Andy Lutomirski June 17, 2016, 8 p.m. UTC
If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
vmalloc_node.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/Kconfig                        | 29 +++++++++++++
 arch/ia64/include/asm/thread_info.h |  2 +-
 include/linux/sched.h               | 15 +++++++
 kernel/fork.c                       | 81 +++++++++++++++++++++++++++++--------
 4 files changed, 109 insertions(+), 18 deletions(-)

Comments

Josh Poimboeuf June 17, 2016, 8:57 p.m. UTC | #1
On Fri, Jun 17, 2016 at 01:00:42PM -0700, Andy Lutomirski wrote:
> @@ -213,24 +232,46 @@ struct kmem_cache *vm_area_cachep;
>  /* SLAB cache for mm_struct structures (tsk->mm) */
>  static struct kmem_cache *mm_cachep;
>  
> -static void account_kernel_stack(struct thread_info *ti, int account)
> +static void account_kernel_stack(struct task_struct *tsk, int account)
>  {
> -	struct zone *zone = page_zone(virt_to_page(ti));
> +	struct zone *zone;
> +	struct thread_info *ti = task_thread_info(tsk);
> +	struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
> +
> +	if (vm) {
> +		int i;
>  
> -	mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
> -			    THREAD_SIZE / 1024 * account);
> +		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
>  
> -	/* All stack pages belong to the same memcg. */
> -	memcg_kmem_update_page_stat(
> -		virt_to_page(ti), MEMCG_KERNEL_STACK,
> -		account * (THREAD_SIZE / PAGE_SIZE));
> +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
> +			mod_zone_page_state(page_zone(vm->pages[i]),
> +					    1, PAGE_SIZE / 1024 * account);

Shouldn't the second argument be NR_KERNEL_STACK_KB instead of 1?
Andy Lutomirski June 17, 2016, 10:18 p.m. UTC | #2
On Fri, Jun 17, 2016 at 1:57 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Fri, Jun 17, 2016 at 01:00:42PM -0700, Andy Lutomirski wrote:
>> @@ -213,24 +232,46 @@ struct kmem_cache *vm_area_cachep;
>>  /* SLAB cache for mm_struct structures (tsk->mm) */
>>  static struct kmem_cache *mm_cachep;
>>
>> -static void account_kernel_stack(struct thread_info *ti, int account)
>> +static void account_kernel_stack(struct task_struct *tsk, int account)
>>  {
>> -     struct zone *zone = page_zone(virt_to_page(ti));
>> +     struct zone *zone;
>> +     struct thread_info *ti = task_thread_info(tsk);
>> +     struct vm_struct *vm = task_stack_vm_area(tsk);
>> +
>> +     BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
>> +
>> +     if (vm) {
>> +             int i;
>>
>> -     mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
>> -                         THREAD_SIZE / 1024 * account);
>> +             BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
>>
>> -     /* All stack pages belong to the same memcg. */
>> -     memcg_kmem_update_page_stat(
>> -             virt_to_page(ti), MEMCG_KERNEL_STACK,
>> -             account * (THREAD_SIZE / PAGE_SIZE));
>> +             for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
>> +                     mod_zone_page_state(page_zone(vm->pages[i]),
>> +                                         1, PAGE_SIZE / 1024 * account);
>
> Shouldn't the second argument be NR_KERNEL_STACK_KB instead of 1?

Indeed. Queued for v3.

--Andy
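
For reference, the fix queued for v3 presumably amounts to passing the stat
item name instead of the hard-coded 1 in the per-page accounting loop,
roughly:

	for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
		mod_zone_page_state(page_zone(vm->pages[i]),
				    NR_KERNEL_STACK_KB,
				    PAGE_SIZE / 1024 * account);
	}
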
Michal Hocko June 20, 2016, 1:36 p.m. UTC | #3
On Fri 17-06-16 13:00:42, Andy Lutomirski wrote:
> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
> vmalloc_node.

I like this! It also reduces demand for higher order (order-2) pages
considerably, which is a great plus on its own. I would be a little bit
worried about the performance because vmalloc wasn't the fastest one
AFAIR. Have you tried to measure that?

From a quick glance the patch looks OK; I will have to look closer,
though.

> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  arch/Kconfig                        | 29 +++++++++++++
>  arch/ia64/include/asm/thread_info.h |  2 +-
>  include/linux/sched.h               | 15 +++++++
>  kernel/fork.c                       | 81 +++++++++++++++++++++++++++++--------
>  4 files changed, 109 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index d794384a0404..a71e6e7195e6 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -658,4 +658,33 @@ config ARCH_NO_COHERENT_DMA_MMAP
>  config CPU_NO_EFFICIENT_FFS
>  	def_bool n
>  
> +config HAVE_ARCH_VMAP_STACK
> +	def_bool n
> +	help
> +	  An arch should select this symbol if it can support kernel stacks
> +	  in vmalloc space.  This means:
> +
> +	  - vmalloc space must be large enough to hold many kernel stacks.
> +	    This may rule out many 32-bit architectures.
> +
> +	  - Stacks in vmalloc space need to work reliably.  For example, if
> +	    vmap page tables are created on demand, either this mechanism
> +	    needs to work while the stack points to a virtual address with
> +	    unpopulated page tables or arch code (switch_to and switch_mm,
> +	    most likely) needs to ensure that the stack's page table entries
> +	    are populated before running on a possibly unpopulated stack.
> +
> +	  - If the stack overflows into a guard page, something reasonable
> +	    should happen.  The definition of "reasonable" is flexible, but
> +	    instantly rebooting without logging anything would be unfriendly.
> +
> +config VMAP_STACK
> +	bool "Use a virtually-mapped stack"
> +	depends on HAVE_ARCH_VMAP_STACK
> +	---help---
> +	  Enable this if you want the use virtually-mapped kernel stacks
> +	  with guard pages.  This causes kernel stack overflows to be
> +	  caught immediately rather than causing difficult-to-diagnose
> +	  corruption.
> +
>  source "kernel/gcov/Kconfig"
> diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
> index aa995b67c3f5..d13edda6e09c 100644
> --- a/arch/ia64/include/asm/thread_info.h
> +++ b/arch/ia64/include/asm/thread_info.h
> @@ -56,7 +56,7 @@ struct thread_info {
>  #define alloc_thread_info_node(tsk, node)	((struct thread_info *) 0)
>  #define task_thread_info(tsk)	((struct thread_info *) 0)
>  #endif
> -#define free_thread_info(ti)	/* nothing */
> +#define free_thread_info(tsk)	/* nothing */
>  #define task_stack_page(tsk)	((void *)(tsk))
>  
>  #define __HAVE_THREAD_FUNCTIONS
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 6e42ada26345..a37c3b790309 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1918,6 +1918,9 @@ struct task_struct {
>  #ifdef CONFIG_MMU
>  	struct task_struct *oom_reaper_list;
>  #endif
> +#ifdef CONFIG_VMAP_STACK
> +	struct vm_struct *stack_vm_area;
> +#endif
>  /* CPU-specific state of this task */
>  	struct thread_struct thread;
>  /*
> @@ -1934,6 +1937,18 @@ extern int arch_task_struct_size __read_mostly;
>  # define arch_task_struct_size (sizeof(struct task_struct))
>  #endif
>  
> +#ifdef CONFIG_VMAP_STACK
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
> +{
> +	return t->stack_vm_area;
> +}
> +#else
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
> +{
> +	return NULL;
> +}
> +#endif
> +
>  /* Future-safe accessor for struct task_struct's cpus_allowed. */
>  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cd2abe6e4e41..ad77a6b07708 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -158,19 +158,38 @@ void __weak arch_release_thread_info(struct thread_info *ti)
>   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
>   * kmemcache based allocator.
>   */
> -# if THREAD_SIZE >= PAGE_SIZE
> +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
>  static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
>  						  int node)
>  {
> +#ifdef CONFIG_VMAP_STACK
> +	struct thread_info *ti = __vmalloc_node_range(
> +		THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END,
> +		THREADINFO_GFP | __GFP_HIGHMEM, PAGE_KERNEL,
> +		0, node, __builtin_return_address(0));
> +
> +	/*
> +	 * We can't call find_vm_area() in interrupt context, and
> +	 * free_thread_info can be called in interrupt context, so cache
> +	 * the vm_struct.
> +	 */
> +	if (ti)
> +		tsk->stack_vm_area = find_vm_area(ti);
> +	return ti;
> +#else
>  	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
>  						  THREAD_SIZE_ORDER);
>  
>  	return page ? page_address(page) : NULL;
> +#endif
>  }
>  
> -static inline void free_thread_info(struct thread_info *ti)
> +static inline void free_thread_info(struct task_struct *tsk)
>  {
> -	free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
> +	if (task_stack_vm_area(tsk))
> +		vfree(tsk->stack);
> +	else
> +		free_kmem_pages((unsigned long)tsk->stack, THREAD_SIZE_ORDER);
>  }
>  # else
>  static struct kmem_cache *thread_info_cache;
> @@ -181,9 +200,9 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
>  	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
>  }
>  
> -static void free_thread_info(struct thread_info *ti)
> +static void free_thread_info(struct task_struct *tsk)
>  {
> -	kmem_cache_free(thread_info_cache, ti);
> +	kmem_cache_free(thread_info_cache, tsk->stack);
>  }
>  
>  void thread_info_cache_init(void)
> @@ -213,24 +232,46 @@ struct kmem_cache *vm_area_cachep;
>  /* SLAB cache for mm_struct structures (tsk->mm) */
>  static struct kmem_cache *mm_cachep;
>  
> -static void account_kernel_stack(struct thread_info *ti, int account)
> +static void account_kernel_stack(struct task_struct *tsk, int account)
>  {
> -	struct zone *zone = page_zone(virt_to_page(ti));
> +	struct zone *zone;
> +	struct thread_info *ti = task_thread_info(tsk);
> +	struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
> +
> +	if (vm) {
> +		int i;
>  
> -	mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
> -			    THREAD_SIZE / 1024 * account);
> +		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
>  
> -	/* All stack pages belong to the same memcg. */
> -	memcg_kmem_update_page_stat(
> -		virt_to_page(ti), MEMCG_KERNEL_STACK,
> -		account * (THREAD_SIZE / PAGE_SIZE));
> +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
> +			mod_zone_page_state(page_zone(vm->pages[i]),
> +					    1, PAGE_SIZE / 1024 * account);
> +		}
> +
> +		/* All stack pages belong to the same memcg. */
> +		memcg_kmem_update_page_stat(
> +			vm->pages[0], MEMCG_KERNEL_STACK,
> +			account * (THREAD_SIZE / PAGE_SIZE));
> +	} else {
> +		zone = page_zone(virt_to_page(ti));
> +
> +		mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
> +				    THREAD_SIZE / 1024 * account);
> +
> +		/* All stack pages belong to the same memcg. */
> +		memcg_kmem_update_page_stat(
> +			virt_to_page(ti), MEMCG_KERNEL_STACK,
> +			account * (THREAD_SIZE / PAGE_SIZE));
> +	}
>  }
>  
>  void free_task(struct task_struct *tsk)
>  {
> -	account_kernel_stack(tsk->stack, -1);
> +	account_kernel_stack(tsk, -1);
>  	arch_release_thread_info(tsk->stack);
> -	free_thread_info(tsk->stack);
> +	free_thread_info(tsk);
>  	rt_mutex_debug_task_free(tsk);
>  	ftrace_graph_exit_task(tsk);
>  	put_seccomp_filter(tsk);
> @@ -342,6 +383,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  {
>  	struct task_struct *tsk;
>  	struct thread_info *ti;
> +	struct vm_struct *stack_vm_area;
>  	int err;
>  
>  	if (node == NUMA_NO_NODE)
> @@ -354,11 +396,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  	if (!ti)
>  		goto free_tsk;
>  
> +	stack_vm_area = task_stack_vm_area(tsk);
> +
>  	err = arch_dup_task_struct(tsk, orig);
>  	if (err)
>  		goto free_ti;
>  
>  	tsk->stack = ti;
> +#ifdef CONFIG_VMAP_STACK
> +	tsk->stack_vm_area = stack_vm_area;
> +#endif
>  #ifdef CONFIG_SECCOMP
>  	/*
>  	 * We must handle setting up seccomp filters once we're under
> @@ -390,14 +437,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  	tsk->task_frag.page = NULL;
>  	tsk->wake_q.next = NULL;
>  
> -	account_kernel_stack(ti, 1);
> +	account_kernel_stack(tsk, 1);
>  
>  	kcov_task_init(tsk);
>  
>  	return tsk;
>  
>  free_ti:
> -	free_thread_info(ti);
> +	free_thread_info(tsk);
>  free_tsk:
>  	free_task_struct(tsk);
>  	return NULL;
> -- 
> 2.5.5
Andy Lutomirski June 20, 2016, 4:13 p.m. UTC | #4
On Mon, Jun 20, 2016 at 6:36 AM, Michal Hocko <mhocko@kernel.org> wrote:
> On Fri 17-06-16 13:00:42, Andy Lutomirski wrote:
>> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
>> vmalloc_node.
>
> I like this! It also reduces demand for higher order (order-2) pages
> considerably, which is a great plus on its own. I would be a little bit
> worried about the performance because vmalloc wasn't the fastest one
> AFAIR. Have you tried to measure that?

It seems to add about 1.5µs to pthread_create+join on my laptop.  (On
an unmodified, stripped-down kernel, it took about 7µs before.  On a
Fedora system, the baseline is much worse.)  I think that most of the
overhead is because vmalloc allocates one page at a time, which means
that it won't use a higher order page even if one is sitting on a
freelist.

I can imagine better integration with the page allocator in which
higher order pages are used if readily available.  Similarly, vfree
could free pages that happen to be aligned and consecutive as a unit
to avoid the overhead of merging them back together one at a time.

But I'm not planning on doing any of this myself any time soon.  I
just want to get the code working and merged.

--Andy
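
The kind of userspace microbenchmark Andy describes can be approximated with
a plain pthread_create+join timing loop; the sketch below is illustrative
only (iteration count and timing method are arbitrary assumptions, not taken
from this thread):

	#include <pthread.h>
	#include <stdio.h>
	#include <time.h>

	static void *thread_fn(void *arg)
	{
		(void)arg;
		return NULL;	/* exit immediately; only create+join is measured */
	}

	int main(void)
	{
		enum { ITERS = 100000 };
		struct timespec start, end;
		double ns;
		pthread_t t;
		int i;

		clock_gettime(CLOCK_MONOTONIC, &start);
		for (i = 0; i < ITERS; i++) {
			if (pthread_create(&t, NULL, thread_fn, NULL))
				return 1;
			pthread_join(t, NULL);
		}
		clock_gettime(CLOCK_MONOTONIC, &end);

		ns = (end.tv_sec - start.tv_sec) * 1e9 +
		     (end.tv_nsec - start.tv_nsec);
		printf("pthread_create+join: %.0f ns per iteration\n", ns / ITERS);
		return 0;
	}

Comparing the reported per-iteration cost with and without CONFIG_VMAP_STACK
should show roughly the delta quoted above.
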
Michal Hocko June 21, 2016, 8:46 a.m. UTC | #5
On Mon 20-06-16 09:13:55, Andy Lutomirski wrote:
> On Mon, Jun 20, 2016 at 6:36 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > On Fri 17-06-16 13:00:42, Andy Lutomirski wrote:
> >> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
> >> vmalloc_node.
> >
> > I like this! It also reduces demand for higher order (order-2) pages
> > considerably, which is a great plus on its own. I would be a little bit
> > worried about the performance because vmalloc wasn't the fastest one
> > AFAIR. Have you tried to measure that?
> 
> It seems to add about 1.5µs to pthread_create+join on my laptop.  (On
> an unmodified, stripped-down kernel, it took about 7µs before.  On a
> Fedora system, the baseline is much worse.)  I think that most of the
> overhead is because vmalloc allocates one page at a time, which means
> that it won't use a higher order page even if one is sitting on a
> freelist.

I guess a less artificial test case which would generate a lot of
tasks and some memory pressure would be more representative (e.g.
kernbench). The thing is that even order-2 pages might get quite
expensive when the memory is fragmented.

> I can imagine better integration with the page allocator in which
> higher order pages are used if readily available.  Similarly, vfree
> could free pages that happen to be aligned and consecutive as a unit
> to avoid the overhead of merging them back together one at a time.
> 
> But I'm not planning on doing any of this myself any time soon.  I
> just want to get the code working and merged.

I agree, there is a room for improvement but no necessarily as a part of
this series.
Andy Lutomirski June 21, 2016, 5:01 p.m. UTC | #6
On Tue, Jun 21, 2016 at 1:46 AM, Michal Hocko <mhocko@kernel.org> wrote:
> On Mon 20-06-16 09:13:55, Andy Lutomirski wrote:
>> On Mon, Jun 20, 2016 at 6:36 AM, Michal Hocko <mhocko@kernel.org> wrote:
>> > On Fri 17-06-16 13:00:42, Andy Lutomirski wrote:
>> >> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
>> >> vmalloc_node.
>> >
>> > I like this! It also reduces demand for higher order (order-2) pages
>> > considerably, which is a great plus on its own. I would be a little bit
>> > worried about the performance because vmalloc wasn't the fastest one
>> > AFAIR. Have you tried to measure that?
>>
>> It seems to add about 1.5µs to pthread_create+join on my laptop.  (On
>> an unmodified, stripped-down kernel, it took about 7µs before.  On a
>> Fedora system, the baseline is much worse.)  I think that most of the
>> overhead is because vmalloc allocates one page at a time, which means
>> that it won't use a higher order page even if one is sitting on a
>> freelist.
>
> I guess a less artificial test case which would generate a lot of
> tasks and some memory pressure would be more representative (e.g.
> kernbench). The thing is that even order-2 pages might get quite
> expensive when the memory is fragmented.
>
>> I can imagine better integration with the page allocator in which
>> higher order pages are used if readily available.  Similarly, vfree
>> could free pages that happen to be aligned and consecutive as a unit
>> to avoid the overhead of merging them back together one at a time.
>>
>> But I'm not planning on doing any of this myself any time soon.  I
>> just want to get the code working and merged.
>
> I agree, there is a room for improvement but no necessarily as a part of
> this series.
>

Agreed.  My goal is to get this good enough for upstream, and we can
make it even better down the road.

That being said, I think I will implement Linus' suggestion of a tiny
percpu cache.

--Andy
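
A tiny per-cpu cache of that kind could look roughly like the sketch below;
the names (cached_stacks, NR_CACHED_STACKS) and the two-slot size are
illustrative assumptions, not code from this series:

	#define NR_CACHED_STACKS 2
	static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

	/* Try to reuse a recently freed stack before falling back to vmalloc. */
	static void *try_get_cached_stack(struct task_struct *tsk)
	{
		int i;

		for (i = 0; i < NR_CACHED_STACKS; i++) {
			struct vm_struct *s = this_cpu_xchg(cached_stacks[i], NULL);

			if (!s)
				continue;
			tsk->stack_vm_area = s;
			return s->addr;
		}
		return NULL;
	}

	/* On free, park the stack in an empty slot instead of vfree()ing it. */
	static bool try_cache_stack(struct vm_struct *vm)
	{
		int i;

		for (i = 0; i < NR_CACHED_STACKS; i++) {
			if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) == NULL)
				return true;
		}
		return false;
	}

In such a scheme, alloc_thread_info_node() would call try_get_cached_stack()
before __vmalloc_node_range(), and free_thread_info() would only fall back
to vfree() when try_cache_stack() finds no empty slot.
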

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index d794384a0404..a71e6e7195e6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -658,4 +658,33 @@  config ARCH_NO_COHERENT_DMA_MMAP
 config CPU_NO_EFFICIENT_FFS
 	def_bool n
 
+config HAVE_ARCH_VMAP_STACK
+	def_bool n
+	help
+	  An arch should select this symbol if it can support kernel stacks
+	  in vmalloc space.  This means:
+
+	  - vmalloc space must be large enough to hold many kernel stacks.
+	    This may rule out many 32-bit architectures.
+
+	  - Stacks in vmalloc space need to work reliably.  For example, if
+	    vmap page tables are created on demand, either this mechanism
+	    needs to work while the stack points to a virtual address with
+	    unpopulated page tables or arch code (switch_to and switch_mm,
+	    most likely) needs to ensure that the stack's page table entries
+	    are populated before running on a possibly unpopulated stack.
+
+	  - If the stack overflows into a guard page, something reasonable
+	    should happen.  The definition of "reasonable" is flexible, but
+	    instantly rebooting without logging anything would be unfriendly.
+
+config VMAP_STACK
+	bool "Use a virtually-mapped stack"
+	depends on HAVE_ARCH_VMAP_STACK
+	---help---
+	  Enable this if you want the use virtually-mapped kernel stacks
+	  with guard pages.  This causes kernel stack overflows to be
+	  caught immediately rather than causing difficult-to-diagnose
+	  corruption.
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index aa995b67c3f5..d13edda6e09c 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -56,7 +56,7 @@  struct thread_info {
 #define alloc_thread_info_node(tsk, node)	((struct thread_info *) 0)
 #define task_thread_info(tsk)	((struct thread_info *) 0)
 #endif
-#define free_thread_info(ti)	/* nothing */
+#define free_thread_info(tsk)	/* nothing */
 #define task_stack_page(tsk)	((void *)(tsk))
 
 #define __HAVE_THREAD_FUNCTIONS
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada26345..a37c3b790309 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1918,6 +1918,9 @@  struct task_struct {
 #ifdef CONFIG_MMU
 	struct task_struct *oom_reaper_list;
 #endif
+#ifdef CONFIG_VMAP_STACK
+	struct vm_struct *stack_vm_area;
+#endif
 /* CPU-specific state of this task */
 	struct thread_struct thread;
 /*
@@ -1934,6 +1937,18 @@  extern int arch_task_struct_size __read_mostly;
 # define arch_task_struct_size (sizeof(struct task_struct))
 #endif
 
+#ifdef CONFIG_VMAP_STACK
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+	return t->stack_vm_area;
+}
+#else
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+	return NULL;
+}
+#endif
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index cd2abe6e4e41..ad77a6b07708 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -158,19 +158,38 @@  void __weak arch_release_thread_info(struct thread_info *ti)
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
-# if THREAD_SIZE >= PAGE_SIZE
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 						  int node)
 {
+#ifdef CONFIG_VMAP_STACK
+	struct thread_info *ti = __vmalloc_node_range(
+		THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END,
+		THREADINFO_GFP | __GFP_HIGHMEM, PAGE_KERNEL,
+		0, node, __builtin_return_address(0));
+
+	/*
+	 * We can't call find_vm_area() in interrupt context, and
+	 * free_thread_info can be called in interrupt context, so cache
+	 * the vm_struct.
+	 */
+	if (ti)
+		tsk->stack_vm_area = find_vm_area(ti);
+	return ti;
+#else
 	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
 						  THREAD_SIZE_ORDER);
 
 	return page ? page_address(page) : NULL;
+#endif
 }
 
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_info(struct task_struct *tsk)
 {
-	free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+	if (task_stack_vm_area(tsk))
+		vfree(tsk->stack);
+	else
+		free_kmem_pages((unsigned long)tsk->stack, THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_info_cache;
@@ -181,9 +200,9 @@  static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_info(struct thread_info *ti)
+static void free_thread_info(struct task_struct *tsk)
 {
-	kmem_cache_free(thread_info_cache, ti);
+	kmem_cache_free(thread_info_cache, tsk->stack);
 }
 
 void thread_info_cache_init(void)
@@ -213,24 +232,46 @@  struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-	struct zone *zone = page_zone(virt_to_page(ti));
+	struct zone *zone;
+	struct thread_info *ti = task_thread_info(tsk);
+	struct vm_struct *vm = task_stack_vm_area(tsk);
+
+	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+	if (vm) {
+		int i;
 
-	mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
-			    THREAD_SIZE / 1024 * account);
+		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 
-	/* All stack pages belong to the same memcg. */
-	memcg_kmem_update_page_stat(
-		virt_to_page(ti), MEMCG_KERNEL_STACK,
-		account * (THREAD_SIZE / PAGE_SIZE));
+		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+			mod_zone_page_state(page_zone(vm->pages[i]),
+					    1, PAGE_SIZE / 1024 * account);
+		}
+
+		/* All stack pages belong to the same memcg. */
+		memcg_kmem_update_page_stat(
+			vm->pages[0], MEMCG_KERNEL_STACK,
+			account * (THREAD_SIZE / PAGE_SIZE));
+	} else {
+		zone = page_zone(virt_to_page(ti));
+
+		mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
+				    THREAD_SIZE / 1024 * account);
+
+		/* All stack pages belong to the same memcg. */
+		memcg_kmem_update_page_stat(
+			virt_to_page(ti), MEMCG_KERNEL_STACK,
+			account * (THREAD_SIZE / PAGE_SIZE));
+	}
 }
 
 void free_task(struct task_struct *tsk)
 {
-	account_kernel_stack(tsk->stack, -1);
+	account_kernel_stack(tsk, -1);
 	arch_release_thread_info(tsk->stack);
-	free_thread_info(tsk->stack);
+	free_thread_info(tsk);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
@@ -342,6 +383,7 @@  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
 	struct thread_info *ti;
+	struct vm_struct *stack_vm_area;
 	int err;
 
 	if (node == NUMA_NO_NODE)
@@ -354,11 +396,16 @@  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (!ti)
 		goto free_tsk;
 
+	stack_vm_area = task_stack_vm_area(tsk);
+
 	err = arch_dup_task_struct(tsk, orig);
 	if (err)
 		goto free_ti;
 
 	tsk->stack = ti;
+#ifdef CONFIG_VMAP_STACK
+	tsk->stack_vm_area = stack_vm_area;
+#endif
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
@@ -390,14 +437,14 @@  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->task_frag.page = NULL;
 	tsk->wake_q.next = NULL;
 
-	account_kernel_stack(ti, 1);
+	account_kernel_stack(tsk, 1);
 
 	kcov_task_init(tsk);
 
 	return tsk;
 
 free_ti:
-	free_thread_info(ti);
+	free_thread_info(tsk);
 free_tsk:
 	free_task_struct(tsk);
 	return NULL;