[v4,26/29] sched: Allow putting thread_info into task_struct
diff mbox

Message ID aeea9c1c41e889065b2c9ecd6d0694e4dfb529c8.1466974736.git.luto@kernel.org
State New
Headers show

Commit Message

Andy Lutomirski June 26, 2016, 9:55 p.m. UTC
If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT,
then thread_info is defined as a single 'u32 flags' and is the first
entry of task_struct.  thread_info::task is removed (it serves no
purpose if thread_info is embedded in task_struct), and
thread_info::cpu gets its own slot in task_struct.

This is heavily based on a patch written by Linus.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 include/linux/init_task.h   |  9 +++++++++
 include/linux/sched.h       | 36 ++++++++++++++++++++++++++++++++++--
 include/linux/thread_info.h | 15 +++++++++++++++
 init/Kconfig                |  3 +++
 init/init_task.c            |  7 +++++--
 kernel/sched/sched.h        |  4 ++++
 6 files changed, 70 insertions(+), 4 deletions(-)

Comments

Mark Rutland July 11, 2016, 10:08 a.m. UTC | #1
Hi,

On Sun, Jun 26, 2016 at 02:55:48PM -0700, Andy Lutomirski wrote:
> If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT,
> then thread_info is defined as a single 'u32 flags' and is the first
> entry of task_struct.  thread_info::task is removed (it serves no
> purpose if thread_info is embedded in task_struct), and
> thread_info::cpu gets its own slot in task_struct.
> 
> This is heavily based on a patch written by Linus.

I've been considering how we'd implement this for arm64, and I suspect
that we'll also need to fold our preempt_count into task_struct
(following from the style of asm-generic/preempt.h).

As far as I can see, we can't make our preempt-count a percpu variable
as with x86, as our percpu ops themselves are based on disabling
preemption.

To that end, would it be possible to keep the thread_info definition per
arch, even with CONFIG_THREAD_INFO_IN_TASK?

Thanks,
Mark.

> 
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  include/linux/init_task.h   |  9 +++++++++
>  include/linux/sched.h       | 36 ++++++++++++++++++++++++++++++++++--
>  include/linux/thread_info.h | 15 +++++++++++++++
>  init/Kconfig                |  3 +++
>  init/init_task.c            |  7 +++++--
>  kernel/sched/sched.h        |  4 ++++
>  6 files changed, 70 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index f8834f820ec2..9c04d44eeb3c 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -15,6 +15,8 @@
>  #include <net/net_namespace.h>
>  #include <linux/sched/rt.h>
>  
> +#include <asm/thread_info.h>
> +
>  #ifdef CONFIG_SMP
>  # define INIT_PUSHABLE_TASKS(tsk)					\
>  	.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
> @@ -183,12 +185,19 @@ extern struct task_group root_task_group;
>  # define INIT_KASAN(tsk)
>  #endif
>  
> +#ifdef CONFIG_THREAD_INFO_IN_TASK
> +# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk),
> +#else
> +# define INIT_TASK_TI(tsk)
> +#endif
> +
>  /*
>   *  INIT_TASK is used to set up the first task table, touch at
>   * your own risk!. Base=0, limit=0x1fffff (=2MB)
>   */
>  #define INIT_TASK(tsk)	\
>  {									\
> +	INIT_TASK_TI(tsk)						\
>  	.state		= 0,						\
>  	.stack		= init_stack,					\
>  	.usage		= ATOMIC_INIT(2),				\
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 569df670407a..4108b4880b86 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1456,6 +1456,13 @@ struct tlbflush_unmap_batch {
>  };
>  
>  struct task_struct {
> +#ifdef CONFIG_THREAD_INFO_IN_TASK
> +	/*
> +	 * For reasons of header soup (see current_thread_info()), this
> +	 * must be the first element of task_struct.
> +	 */
> +	struct thread_info thread_info;
> +#endif
>  	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
>  	void *stack;
>  	atomic_t usage;
> @@ -1465,6 +1472,9 @@ struct task_struct {
>  #ifdef CONFIG_SMP
>  	struct llist_node wake_entry;
>  	int on_cpu;
> +#ifdef CONFIG_THREAD_INFO_IN_TASK
> +	unsigned int cpu;	/* current CPU */
> +#endif
>  	unsigned int wakee_flips;
>  	unsigned long wakee_flip_decay_ts;
>  	struct task_struct *last_wakee;
> @@ -2557,7 +2567,9 @@ extern void set_curr_task(int cpu, struct task_struct *p);
>  void yield(void);
>  
>  union thread_union {
> +#ifndef CONFIG_THREAD_INFO_IN_TASK
>  	struct thread_info thread_info;
> +#endif
>  	unsigned long stack[THREAD_SIZE/sizeof(long)];
>  };
>  
> @@ -3045,10 +3057,26 @@ static inline void threadgroup_change_end(struct task_struct *tsk)
>  	cgroup_threadgroup_change_end(tsk);
>  }
>  
> -#ifndef __HAVE_THREAD_FUNCTIONS
> +#ifdef CONFIG_THREAD_INFO_IN_TASK
> +
> +static inline struct thread_info *task_thread_info(struct task_struct *task)
> +{
> +	return &task->thread_info;
> +}
> +static inline void *task_stack_page(const struct task_struct *task)
> +{
> +	return task->stack;
> +}
> +#define setup_thread_stack(new,old)	do { } while(0)
> +static inline unsigned long *end_of_stack(const struct task_struct *task)
> +{
> +	return task->stack;
> +}
> +
> +#elif !defined(__HAVE_THREAD_FUNCTIONS)
>  
>  #define task_thread_info(task)	((struct thread_info *)(task)->stack)
> -#define task_stack_page(task)	((task)->stack)
> +#define task_stack_page(task)	((void *)(task)->stack)
>  
>  static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
>  {
> @@ -3348,7 +3376,11 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
>  
>  static inline unsigned int task_cpu(const struct task_struct *p)
>  {
> +#ifdef CONFIG_THREAD_INFO_IN_TASK
> +	return p->cpu;
> +#else
>  	return task_thread_info(p)->cpu;
> +#endif
>  }
>  
>  static inline int task_node(const struct task_struct *p)
> diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
> index 352b1542f5cc..b2b32d63bc8e 100644
> --- a/include/linux/thread_info.h
> +++ b/include/linux/thread_info.h
> @@ -13,6 +13,21 @@
>  struct timespec;
>  struct compat_timespec;
>  
> +#ifdef CONFIG_THREAD_INFO_IN_TASK
> +struct thread_info {
> +	u32			flags;		/* low level flags */
> +};
> +
> +#define INIT_THREAD_INFO(tsk)			\
> +{						\
> +	.flags		= 0,			\
> +}
> +#endif
> +
> +#ifdef CONFIG_THREAD_INFO_IN_TASK
> +#define current_thread_info() ((struct thread_info *)current)
> +#endif
> +
>  /*
>   * System call restart block.
>   */
> diff --git a/init/Kconfig b/init/Kconfig
> index f755a602d4a1..0c83af6d3753 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -26,6 +26,9 @@ config IRQ_WORK
>  config BUILDTIME_EXTABLE_SORT
>  	bool
>  
> +config THREAD_INFO_IN_TASK
> +	bool
> +
>  menu "General setup"
>  
>  config BROKEN
> diff --git a/init/init_task.c b/init/init_task.c
> index ba0a7f362d9e..11f83be1fa79 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task);
>   * Initial thread structure. Alignment of this is handled by a special
>   * linker map entry.
>   */
> -union thread_union init_thread_union __init_task_data =
> -	{ INIT_THREAD_INFO(init_task) };
> +union thread_union init_thread_union __init_task_data = {
> +#ifndef CONFIG_THREAD_INFO_IN_TASK
> +	INIT_THREAD_INFO(init_task)
> +#endif
> +};
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 7cbeb92a1cb9..a1cabcea4c54 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -999,7 +999,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
>  	 * per-task data have been completed by this moment.
>  	 */
>  	smp_wmb();
> +#ifdef CONFIG_THREAD_INFO_IN_TASK
> +	p->cpu = cpu;
> +#else
>  	task_thread_info(p)->cpu = cpu;
> +#endif
>  	p->wake_cpu = cpu;
>  #endif
>  }
> -- 
> 2.7.4
>
Andy Lutomirski July 11, 2016, 2:55 p.m. UTC | #2
On Jul 11, 2016 3:08 AM, "Mark Rutland" <mark.rutland@arm.com> wrote:
>
> Hi,
>
> On Sun, Jun 26, 2016 at 02:55:48PM -0700, Andy Lutomirski wrote:
> > If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT,
> > then thread_info is defined as a single 'u32 flags' and is the first
> > entry of task_struct.  thread_info::task is removed (it serves no
> > purpose if thread_info is embedded in task_struct), and
> > thread_info::cpu gets its own slot in task_struct.
> >
> > This is heavily based on a patch written by Linus.
>
> I've been considering how we'd implement this for arm64, and I suspect
> that we'll also need to fold our preempt_count into task_struct
> (following from the style of asm-generic/preempt.h).
>
> As far as I can see, we can't make our preempt-count a percpu variable
> as with x86, as our percpu ops themselves are based on disabling
> preemption.

How do you intend to find 'current' to get to the preempt count
without first disabling preemption?

>
> To that end, would it be possible to keep the thread_info definition per
> arch, even with CONFIG_THREAD_INFO_IN_TASK?

In principal, yes, but could you alternatively put it in
thread_struct?  My goal here is to encourage people to clean up their
use of thread_info vs thread_struct at the same time.  For x86, that
cleanup was trivial -- most of the work was addressing relative to
current instead of the stack pointer, and that had to happen
regardless.

--Andy
Mark Rutland July 11, 2016, 3:08 p.m. UTC | #3
On Mon, Jul 11, 2016 at 07:55:17AM -0700, Andy Lutomirski wrote:
> On Jul 11, 2016 3:08 AM, "Mark Rutland" <mark.rutland@arm.com> wrote:
> >
> > Hi,
> >
> > On Sun, Jun 26, 2016 at 02:55:48PM -0700, Andy Lutomirski wrote:
> > > If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT,
> > > then thread_info is defined as a single 'u32 flags' and is the first
> > > entry of task_struct.  thread_info::task is removed (it serves no
> > > purpose if thread_info is embedded in task_struct), and
> > > thread_info::cpu gets its own slot in task_struct.
> > >
> > > This is heavily based on a patch written by Linus.
> >
> > I've been considering how we'd implement this for arm64, and I suspect
> > that we'll also need to fold our preempt_count into task_struct
> > (following from the style of asm-generic/preempt.h).
> >
> > As far as I can see, we can't make our preempt-count a percpu variable
> > as with x86, as our percpu ops themselves are based on disabling
> > preemption.
> 
> How do you intend to find 'current' to get to the preempt count
> without first disabling preemption?

Good point.

For some reason I had convinced myself that it only mattered for RMW
sequences, so evidently I hadn't considered things thoroughly enough. :(

> > To that end, would it be possible to keep the thread_info definition per
> > arch, even with CONFIG_THREAD_INFO_IN_TASK?
> 
> In principal, yes, but could you alternatively put it in
> thread_struct?  My goal here is to encourage people to clean up their
> use of thread_info vs thread_struct at the same time.  For x86, that
> cleanup was trivial -- most of the work was addressing relative to
> current instead of the stack pointer, and that had to happen
> regardless.

I'm more than happy to do that, modulo the above permitting.

Sorry for the noise!

Thanks,
Mark.
Linus Torvalds July 11, 2016, 4:06 p.m. UTC | #4
On Jul 11, 2016 7:55 AM, "Andy Lutomirski" <luto@amacapital.net> wrote:
>
> How do you intend to find 'current' to get to the preempt count
> without first disabling preemption?

Actually, that is the classic case of "not a problem".

The thing is, it doesn't matter if you schedule away while looking up
current or the preempt count - because both values are idempotent wet
scheduling.

So until you do the wire that actually disables preemption you can schedule
away as much as you want, and after that write you no longer will.

This is different wrt a per-cpu area - which is clearly not idempotent wrt
scheduling.

The reason per-cpu works on x86 is that we have an atomic rmw operation
that is *also* atomic wrt the CPU lookup (thanks to the segment base)

     Linus
Mark Rutland July 11, 2016, 4:31 p.m. UTC | #5
On Mon, Jul 11, 2016 at 09:06:58AM -0700, Linus Torvalds wrote:
> On Jul 11, 2016 7:55 AM, "Andy Lutomirski" <[1]luto@amacapital.net> wrote:
> >
> > How do you intend to find 'current' to get to the preempt count
> > without first disabling preemption?
>
> Actually, that is the classic case of "not a problem".
>
> The thing is, it doesn't matter if you schedule away while looking up
> current or the preempt count - because both values are idempotent wet
> scheduling.
>
> So until you do the wire that actually disables preemption you can
> schedule away as much as you want, and after that write you no longer
> will.

I was assuming a percpu pointer to current (or preempt count).

The percpu offset might be stale at the point you try to dereference
that, even though current itself hasn't changed, and you may access the
wrong CPU's value.

> This is different wrt a per-cpu area - which is clearly not idempotent wrt
> scheduling.
>
> The reason per-cpu works on x86 is that we have an atomic rmw operation
> that is *also* atomic wrt the CPU lookup (thanks to the segment base)

Sure, understood.

Mark.
Linus Torvalds July 11, 2016, 4:42 p.m. UTC | #6
On Mon, Jul 11, 2016 at 9:31 AM, Mark Rutland <mark.rutland@arm.com> wrote:
>>
>> So until you do the wire that actually disables preemption you can
>> schedule away as much as you want, and after that write you no longer
>> will.
>
> I was assuming a percpu pointer to current (or preempt count).

So for the same reason that is ok *iff* you have

 - some kind of dedicated percpu register (or other base pointer - x86
has the segment thing) that gets updated when you schedule.

 - an instruction that can load 'current' directly off that register atomically.

But yes, percpu data in general is obviously not safe to access
without preemption.

         Linus

Patch
diff mbox

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f8834f820ec2..9c04d44eeb3c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,8 @@ 
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
 
+#include <asm/thread_info.h>
+
 #ifdef CONFIG_SMP
 # define INIT_PUSHABLE_TASKS(tsk)					\
 	.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
@@ -183,12 +185,19 @@  extern struct task_group root_task_group;
 # define INIT_KASAN(tsk)
 #endif
 
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk),
+#else
+# define INIT_TASK_TI(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
 #define INIT_TASK(tsk)	\
 {									\
+	INIT_TASK_TI(tsk)						\
 	.state		= 0,						\
 	.stack		= init_stack,					\
 	.usage		= ATOMIC_INIT(2),				\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 569df670407a..4108b4880b86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1456,6 +1456,13 @@  struct tlbflush_unmap_batch {
 };
 
 struct task_struct {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	/*
+	 * For reasons of header soup (see current_thread_info()), this
+	 * must be the first element of task_struct.
+	 */
+	struct thread_info thread_info;
+#endif
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
 	atomic_t usage;
@@ -1465,6 +1472,9 @@  struct task_struct {
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
 	int on_cpu;
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	unsigned int cpu;	/* current CPU */
+#endif
 	unsigned int wakee_flips;
 	unsigned long wakee_flip_decay_ts;
 	struct task_struct *last_wakee;
@@ -2557,7 +2567,9 @@  extern void set_curr_task(int cpu, struct task_struct *p);
 void yield(void);
 
 union thread_union {
+#ifndef CONFIG_THREAD_INFO_IN_TASK
 	struct thread_info thread_info;
+#endif
 	unsigned long stack[THREAD_SIZE/sizeof(long)];
 };
 
@@ -3045,10 +3057,26 @@  static inline void threadgroup_change_end(struct task_struct *tsk)
 	cgroup_threadgroup_change_end(tsk);
 }
 
-#ifndef __HAVE_THREAD_FUNCTIONS
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+
+static inline struct thread_info *task_thread_info(struct task_struct *task)
+{
+	return &task->thread_info;
+}
+static inline void *task_stack_page(const struct task_struct *task)
+{
+	return task->stack;
+}
+#define setup_thread_stack(new,old)	do { } while(0)
+static inline unsigned long *end_of_stack(const struct task_struct *task)
+{
+	return task->stack;
+}
+
+#elif !defined(__HAVE_THREAD_FUNCTIONS)
 
 #define task_thread_info(task)	((struct thread_info *)(task)->stack)
-#define task_stack_page(task)	((task)->stack)
+#define task_stack_page(task)	((void *)(task)->stack)
 
 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
 {
@@ -3348,7 +3376,11 @@  static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
 
 static inline unsigned int task_cpu(const struct task_struct *p)
 {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	return p->cpu;
+#else
 	return task_thread_info(p)->cpu;
+#endif
 }
 
 static inline int task_node(const struct task_struct *p)
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 352b1542f5cc..b2b32d63bc8e 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -13,6 +13,21 @@ 
 struct timespec;
 struct compat_timespec;
 
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+struct thread_info {
+	u32			flags;		/* low level flags */
+};
+
+#define INIT_THREAD_INFO(tsk)			\
+{						\
+	.flags		= 0,			\
+}
+#endif
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+#define current_thread_info() ((struct thread_info *)current)
+#endif
+
 /*
  * System call restart block.
  */
diff --git a/init/Kconfig b/init/Kconfig
index f755a602d4a1..0c83af6d3753 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -26,6 +26,9 @@  config IRQ_WORK
 config BUILDTIME_EXTABLE_SORT
 	bool
 
+config THREAD_INFO_IN_TASK
+	bool
+
 menu "General setup"
 
 config BROKEN
diff --git a/init/init_task.c b/init/init_task.c
index ba0a7f362d9e..11f83be1fa79 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -22,5 +22,8 @@  EXPORT_SYMBOL(init_task);
  * Initial thread structure. Alignment of this is handled by a special
  * linker map entry.
  */
-union thread_union init_thread_union __init_task_data =
-	{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data = {
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+	INIT_THREAD_INFO(init_task)
+#endif
+};
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7cbeb92a1cb9..a1cabcea4c54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -999,7 +999,11 @@  static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 	 * per-task data have been completed by this moment.
 	 */
 	smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	p->cpu = cpu;
+#else
 	task_thread_info(p)->cpu = cpu;
+#endif
 	p->wake_cpu = cpu;
 #endif
 }