
[2/4] kernel.h: Add non_block_start/end()

Message ID 20181210103641.31259-3-daniel.vetter@ffwll.ch (mailing list archive)
State New, archived
Series mmu notifier debug checks v2

Commit Message

Daniel Vetter Dec. 10, 2018, 10:36 a.m. UTC
In some special cases we must not block, but there is no spinlock,
preempt-off, irqs-off or similar critical section already in place that
arms the might_sleep() debug checks. Add a non_block_start/end() pair to
annotate these.

This will be used in the oom paths of mmu-notifiers, where blocking is
not allowed, to make sure there's forward progress.

Suggested by Michal Hocko.
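
[ For illustration only, not part of this patch: a caller in the
  mmu-notifier oom path would bracket the non-blocking region roughly
  like this; the helper name below is made up for the sketch.

	non_block_start();
	ret = invoke_oom_notifiers(mm, start, end);	/* must not sleep */
	non_block_end();
]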

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: linux-mm@kvack.org
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 include/linux/kernel.h | 10 +++++++++-
 include/linux/sched.h  |  4 ++++
 kernel/sched/core.c    |  6 +++---
 3 files changed, 16 insertions(+), 4 deletions(-)

Comments

Michal Hocko Dec. 10, 2018, 2:13 p.m. UTC | #1
I do not see any scheduler guys Cced and it would be really great to get
their opinion here.

On Mon 10-12-18 11:36:39, Daniel Vetter wrote:
> In some special cases we must not block, but there's not a
> spinlock, preempt-off, irqs-off or similar critical section already
> that arms the might_sleep() debug checks. Add a non_block_start/end()
> pair to annotate these.
> 
> This will be used in the oom paths of mmu-notifiers, where blocking is
> not allowed to make sure there's forward progress.

Considering the only alternative would be to abuse
preempt_{disable,enable}, and that really has a different semantic, I
think this makes some sense. The context is preemptible but we do not
want the notifier to sleep on any locks, WQ etc.
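
[ A rough illustration of that semantic difference, not taken from the
  patch:

	preempt_disable();	/* forbids preemption entirely, so the
				 * section has to stay short */
	/* ... */
	preempt_enable();

	non_block_start();	/* task stays fully preemptible, it just
				 * must not voluntarily sleep (mutexes,
				 * waitqueues, blocking allocations, ...) */
	/* ... */
	non_block_end();
]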

> Suggested by Michal Hocko.
> 
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: David Rientjes <rientjes@google.com>
> Cc: "Christian König" <christian.koenig@amd.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: "Jérôme Glisse" <jglisse@redhat.com>
> Cc: linux-mm@kvack.org
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> ---
>  include/linux/kernel.h | 10 +++++++++-
>  include/linux/sched.h  |  4 ++++
>  kernel/sched/core.c    |  6 +++---
>  3 files changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index d6aac75b51ba..c2cf31515b3d 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -251,7 +251,9 @@ extern int _cond_resched(void);
>   * might_sleep - annotation for functions that can sleep
>   *
>   * this macro will print a stack trace if it is executed in an atomic
> - * context (spinlock, irq-handler, ...).
> + * context (spinlock, irq-handler, ...). Additional sections where blocking is
> + * not allowed can be annotated with non_block_start() and non_block_end()
> + * pairs.
>   *
>   * This is a useful debugging help to be able to catch problems early and not
>   * be bitten later when the calling function happens to sleep when it is not
> @@ -260,6 +262,10 @@ extern int _cond_resched(void);
>  # define might_sleep() \
>  	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
>  # define sched_annotate_sleep()	(current->task_state_change = 0)
> +# define non_block_start() \
> +	do { current->non_block_count++; } while (0)
> +# define non_block_end() \
> +	do { WARN_ON(current->non_block_count-- == 0); } while (0)
>  #else
>    static inline void ___might_sleep(const char *file, int line,
>  				   int preempt_offset) { }
> @@ -267,6 +273,8 @@ extern int _cond_resched(void);
>  				   int preempt_offset) { }
>  # define might_sleep() do { might_resched(); } while (0)
>  # define sched_annotate_sleep() do { } while (0)
> +# define non_block_start() do { } while (0)
> +# define non_block_end() do { } while (0)
>  #endif
>  
>  #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ecffd4e37453..41249dbf8f27 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -916,6 +916,10 @@ struct task_struct {
>  	struct mutex_waiter		*blocked_on;
>  #endif
>  
> +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
> +	int				non_block_count;
> +#endif
> +
>  #ifdef CONFIG_TRACE_IRQFLAGS
>  	unsigned int			irq_events;
>  	unsigned long			hardirq_enable_ip;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 6fedf3a98581..969d7a71f30c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6113,7 +6113,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
>  	rcu_sleep_check();
>  
>  	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
> -	     !is_idle_task(current)) ||
> +	     !is_idle_task(current) && !current->non_block_count) ||
>  	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
>  	    oops_in_progress)
>  		return;
> @@ -6129,8 +6129,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
>  		"BUG: sleeping function called from invalid context at %s:%d\n",
>  			file, line);
>  	printk(KERN_ERR
> -		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
> -			in_atomic(), irqs_disabled(),
> +		"in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
> +			in_atomic(), irqs_disabled(), current->non_block_count,
>  			current->pid, current->comm);
>  
>  	if (task_stack_end_corrupted(current))
> -- 
> 2.20.0.rc1
>
Peter Zijlstra Dec. 10, 2018, 2:47 p.m. UTC | #2
On Mon, Dec 10, 2018 at 03:13:37PM +0100, Michal Hocko wrote:
> I do not see any scheduler guys Cced and it would be really great to get
> their opinion here.
> 
> On Mon 10-12-18 11:36:39, Daniel Vetter wrote:
> > In some special cases we must not block, but there's not a
> > spinlock, preempt-off, irqs-off or similar critical section already
> > that arms the might_sleep() debug checks. Add a non_block_start/end()
> > pair to annotate these.
> > 
> > This will be used in the oom paths of mmu-notifiers, where blocking is
> > not allowed to make sure there's forward progress.
> 
> Considering the only alternative would be to abuse
> preempt_{disable,enable}, and that really has a different semantic, I
> think this makes some sense. The context is preemptible but we do not
> want the notifier to sleep on any locks, WQ etc.

I'm confused... what is this supposed to do?

And what does 'block' mean here? Without preempt_disable/IRQ-off we're
subject to regular preemption and execution can stall for arbitrary
amounts of time.

The Changelog doesn't yield any clues.
Michal Hocko Dec. 10, 2018, 3:01 p.m. UTC | #3
On Mon 10-12-18 15:47:11, Peter Zijlstra wrote:
> On Mon, Dec 10, 2018 at 03:13:37PM +0100, Michal Hocko wrote:
> > I do not see any scheduler guys Cced and it would be really great to get
> > their opinion here.
> > 
> > On Mon 10-12-18 11:36:39, Daniel Vetter wrote:
> > > In some special cases we must not block, but there's not a
> > > spinlock, preempt-off, irqs-off or similar critical section already
> > > that arms the might_sleep() debug checks. Add a non_block_start/end()
> > > pair to annotate these.
> > > 
> > > This will be used in the oom paths of mmu-notifiers, where blocking is
> > > not allowed to make sure there's forward progress.
> > 
> > Considering the only alternative would be to abuse
> > preempt_{disable,enable}, and that really has a different semantic, I
> > think this makes some sense. The context is preemptible but we do not
> > want the notifier to sleep on any locks, WQ etc.
> 
> I'm confused... what is this supposed to do?
> 
> And what does 'block' mean here? Without preempt_disable/IRQ-off we're
> subject to regular preemption and execution can stall for arbitrary
> amounts of time.

The notifier is called from quite a restricted context - oom_reaper -
which shouldn't depend on any locks or sleepable conditionals. The code
should be swift as well, but mostly we care about it making forward
progress. Checking for a sleepable context is the best thing we could
come up with that describes these demands, at least partially.
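
[ Hypothetical sketch of what that restriction means for an mmu-notifier
  callback invoked via the oom_reaper; the callback, device struct and
  lock are made up for the example:

static int foo_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start, unsigned long end,
				      bool blockable)
{
	struct foo_dev *foo = container_of(mn, struct foo_dev, mn);

	if (!blockable)
		return -EAGAIN;	/* oom_reaper path: back off, don't sleep */

	mutex_lock(&foo->lock);	/* only legitimate on the blockable path;
				 * under non_block_start/end this would
				 * now trigger the might_sleep() splat */
	/* ... tear down the mappings ... */
	mutex_unlock(&foo->lock);
	return 0;
}
]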
Peter Zijlstra Dec. 10, 2018, 3:22 p.m. UTC | #4
On Mon, Dec 10, 2018 at 04:01:59PM +0100, Michal Hocko wrote:
> On Mon 10-12-18 15:47:11, Peter Zijlstra wrote:
> > On Mon, Dec 10, 2018 at 03:13:37PM +0100, Michal Hocko wrote:
> > > I do not see any scheduler guys Cced and it would be really great to get
> > > their opinion here.
> > > 
> > > On Mon 10-12-18 11:36:39, Daniel Vetter wrote:
> > > > In some special cases we must not block, but there's not a
> > > > spinlock, preempt-off, irqs-off or similar critical section already
> > > > that arms the might_sleep() debug checks. Add a non_block_start/end()
> > > > pair to annotate these.
> > > > 
> > > > This will be used in the oom paths of mmu-notifiers, where blocking is
> > > > not allowed to make sure there's forward progress.
> > > 
> > > Considering the only alternative would be to abuse
> > > preempt_{disable,enable}, and that really has a different semantic, I
> > > think this makes some sense. The context is preemptible but we do not
> > > want the notifier to sleep on any locks, WQ etc.
> > 
> > I'm confused... what is this supposed to do?
> > 
> > And what does 'block' mean here? Without preempt_disable/IRQ-off we're
> > subject to regular preemption and execution can stall for arbitrary
> > amounts of time.
> 
> The notifier is called from quite a restricted context - oom_reaper - 
> which shouldn't depend on any locks or sleepable conditionals. 

You want to exclude spinlocks too? We could maybe frob something with
lockdep if you need that?

> The code
> should be swift as well but we mostly do care about it to make a forward
> progress. Checking for sleepable context is the best thing we could come
> up with that would describe these demands at least partially.

OK, no real objections to the thing.  Just so long we're all on the same
page as to what it does and doesn't do ;-)

I suppose you could extend the check to include schedule_debug() as
well, maybe something like:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f66920173370..b1aaa278f1af 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3278,13 +3278,18 @@ static noinline void __schedule_bug(struct task_struct *prev)
 /*
  * Various schedule()-time debugging checks and statistics:
  */
-static inline void schedule_debug(struct task_struct *prev)
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
 	if (task_stack_end_corrupted(prev))
 		panic("corrupted stack end detected inside scheduler\n");
 #endif
 
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+	if (!preempt && prev->state && prev->non_block_count)
+		// splat
+#endif
+
 	if (unlikely(in_atomic_preempt_off())) {
 		__schedule_bug(prev);
 		preempt_count_set(PREEMPT_DISABLED);
@@ -3391,7 +3396,7 @@ static void __sched notrace __schedule(bool preempt)
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
-	schedule_debug(prev);
+	schedule_debug(prev, preempt);
 
 	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
Michal Hocko Dec. 10, 2018, 4:20 p.m. UTC | #5
On Mon 10-12-18 16:22:53, Peter Zijlstra wrote:
> On Mon, Dec 10, 2018 at 04:01:59PM +0100, Michal Hocko wrote:
> > On Mon 10-12-18 15:47:11, Peter Zijlstra wrote:
> > > On Mon, Dec 10, 2018 at 03:13:37PM +0100, Michal Hocko wrote:
> > > > I do not see any scheduler guys Cced and it would be really great to get
> > > > their opinion here.
> > > > 
> > > > On Mon 10-12-18 11:36:39, Daniel Vetter wrote:
> > > > > In some special cases we must not block, but there's not a
> > > > > spinlock, preempt-off, irqs-off or similar critical section already
> > > > > that arms the might_sleep() debug checks. Add a non_block_start/end()
> > > > > pair to annotate these.
> > > > > 
> > > > > This will be used in the oom paths of mmu-notifiers, where blocking is
> > > > > not allowed to make sure there's forward progress.
> > > > 
> > > > Considering the only alternative would be to abuse
> > > > preempt_{disable,enable}, and that really has a different semantic, I
> > > > think this makes some sense. The context is preemptible but we do not
> > > > want the notifier to sleep on any locks, WQ etc.
> > > 
> > > I'm confused... what is this supposed to do?
> > > 
> > > And what does 'block' mean here? Without preempt_disable/IRQ-off we're
> > > subject to regular preemption and execution can stall for arbitrary
> > > amounts of time.
> > 
> > The notifier is called from quite a restricted context - oom_reaper - 
> > which shouldn't depend on any locks or sleepable conditionals. 
> 
> You want to exclude spinlocks too? We could maybe frob something with
> lockdep if you need that?

Spinlocks are less of a problem because you cannot have an (in)direct
dependency on the page allocator that would deadlock. Spinlocks, or
preemption-disabled sections in general, should be short enough to
guarantee forward progress.

> > The code
> > should be swift as well but we mostly do care about it to make a forward
> > progress. Checking for sleepable context is the best thing we could come
> > up with that would describe these demands at least partially.
> 
> OK, no real objections to the thing.  Just so long we're all on the same
> page as to what it does and doesn't do ;-)

I am not really sure whether there are other potential users besides
this one and whether the check as such is justified.

> I suppose you could extend the check to include schedule_debug() as
> well, maybe something like:

Do you mean to make the check cheaper?

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index f66920173370..b1aaa278f1af 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3278,13 +3278,18 @@ static noinline void __schedule_bug(struct task_struct *prev)
>  /*
>   * Various schedule()-time debugging checks and statistics:
>   */
> -static inline void schedule_debug(struct task_struct *prev)
> +static inline void schedule_debug(struct task_struct *prev, bool preempt)
>  {
>  #ifdef CONFIG_SCHED_STACK_END_CHECK
>  	if (task_stack_end_corrupted(prev))
>  		panic("corrupted stack end detected inside scheduler\n");
>  #endif
>  
> +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
> +	if (!preempt && prev->state && prev->non_block_count)
> +		// splat
> +#endif
> +
>  	if (unlikely(in_atomic_preempt_off())) {
>  		__schedule_bug(prev);
>  		preempt_count_set(PREEMPT_DISABLED);
> @@ -3391,7 +3396,7 @@ static void __sched notrace __schedule(bool preempt)
>  	rq = cpu_rq(cpu);
>  	prev = rq->curr;
>  
> -	schedule_debug(prev);
> +	schedule_debug(prev, preempt);
>  
>  	if (sched_feat(HRTICK))
>  		hrtick_clear(rq);
Peter Zijlstra Dec. 10, 2018, 4:30 p.m. UTC | #6
On Mon, Dec 10, 2018 at 05:20:10PM +0100, Michal Hocko wrote:
> > OK, no real objections to the thing.  Just so long we're all on the same
> > page as to what it does and doesn't do ;-)
> 
> I am not really sure whether there are other potential users besides
> this one and whether the check as such is justified.

It's a debug option...

> > I suppose you could extend the check to include schedule_debug() as
> > well, maybe something like:
> 
> Do you mean to make the check cheaper?

Nah, so the patch only touched might_sleep(), the below touches
schedule().

If there were a path that hits schedule() without going through a
might_sleep() (rare in practice I think, but entirely possible) then you
wouldn't get a splat without something like the below on top.
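
[ Sketch of such a path (names hypothetical): an open-coded wait loop
  calls schedule() directly and never passes through might_sleep(), so
  only the schedule_debug() check would catch it inside a non_block
  section:

	set_current_state(TASK_UNINTERRUPTIBLE);
	while (!done(ctx)) {		/* made-up wait condition */
		schedule();		/* no might_sleep() on this path */
		set_current_state(TASK_UNINTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
]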

> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index f66920173370..b1aaa278f1af 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3278,13 +3278,18 @@ static noinline void __schedule_bug(struct task_struct *prev)
> >  /*
> >   * Various schedule()-time debugging checks and statistics:
> >   */
> > -static inline void schedule_debug(struct task_struct *prev)
> > +static inline void schedule_debug(struct task_struct *prev, bool preempt)
> >  {
> >  #ifdef CONFIG_SCHED_STACK_END_CHECK
> >  	if (task_stack_end_corrupted(prev))
> >  		panic("corrupted stack end detected inside scheduler\n");
> >  #endif
> >  
> > +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
> > +	if (!preempt && prev->state && prev->non_block_count)
> > +		// splat
> > +#endif
> > +
> >  	if (unlikely(in_atomic_preempt_off())) {
> >  		__schedule_bug(prev);
> >  		preempt_count_set(PREEMPT_DISABLED);
> > @@ -3391,7 +3396,7 @@ static void __sched notrace __schedule(bool preempt)
> >  	rq = cpu_rq(cpu);
> >  	prev = rq->curr;
> >  
> > -	schedule_debug(prev);
> > +	schedule_debug(prev, preempt);
> >  
> >  	if (sched_feat(HRTICK))
> >  		hrtick_clear(rq);
> 
> -- 
> Michal Hocko
> SUSE Labs
Daniel Vetter Dec. 12, 2018, 10:26 a.m. UTC | #7
On Mon, Dec 10, 2018 at 05:30:09PM +0100, Peter Zijlstra wrote:
> On Mon, Dec 10, 2018 at 05:20:10PM +0100, Michal Hocko wrote:
> > > OK, no real objections to the thing.  Just so long we're all on the same
> > > page as to what it does and doesn't do ;-)
> > 
> > I am not really sure whether there are other potential users besides
> > this one and whether the check as such is justified.
> 
> It's a debug option...
> 
> > > I suppose you could extend the check to include schedule_debug() as
> > > well, maybe something like:
> > 
> > Do you mean to make the check cheaper?
> 
> Nah, so the patch only touched might_sleep(), the below touches
> schedule().
> 
> If there were a patch that hits schedule() without going through a
> might_sleep() (rare in practise I think, but entirely possible) then you
> won't get a splat without something like the below on top.

We have a bunch of schedule() calls in i915, e.g. for waiting on multiple
events at the same time (where we want to unblock if any of them fires),
and there's no might_sleep() in those paths afaict. Adding the check in
schedule() sounds useful; I'll include your snippet in v2, and try a bit
harder to explain in the commit message why Michal suggested these.

Thanks, Daniel

> 
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index f66920173370..b1aaa278f1af 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -3278,13 +3278,18 @@ static noinline void __schedule_bug(struct task_struct *prev)
> > >  /*
> > >   * Various schedule()-time debugging checks and statistics:
> > >   */
> > > -static inline void schedule_debug(struct task_struct *prev)
> > > +static inline void schedule_debug(struct task_struct *prev, bool preempt)
> > >  {
> > >  #ifdef CONFIG_SCHED_STACK_END_CHECK
> > >  	if (task_stack_end_corrupted(prev))
> > >  		panic("corrupted stack end detected inside scheduler\n");
> > >  #endif
> > >  
> > > +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
> > > +	if (!preempt && prev->state && prev->non_block_count)
> > > +		// splat
> > > +#endif
> > > +
> > >  	if (unlikely(in_atomic_preempt_off())) {
> > >  		__schedule_bug(prev);
> > >  		preempt_count_set(PREEMPT_DISABLED);
> > > @@ -3391,7 +3396,7 @@ static void __sched notrace __schedule(bool preempt)
> > >  	rq = cpu_rq(cpu);
> > >  	prev = rq->curr;
> > >  
> > > -	schedule_debug(prev);
> > > +	schedule_debug(prev, preempt);
> > >  
> > >  	if (sched_feat(HRTICK))
> > >  		hrtick_clear(rq);
> > 
> > -- 
> > Michal Hocko
> > SUSE Labs

Patch

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6aac75b51ba..c2cf31515b3d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -251,7 +251,9 @@  extern int _cond_resched(void);
  * might_sleep - annotation for functions that can sleep
  *
  * this macro will print a stack trace if it is executed in an atomic
- * context (spinlock, irq-handler, ...).
+ * context (spinlock, irq-handler, ...). Additional sections where blocking is
+ * not allowed can be annotated with non_block_start() and non_block_end()
+ * pairs.
  *
  * This is a useful debugging help to be able to catch problems early and not
  * be bitten later when the calling function happens to sleep when it is not
@@ -260,6 +262,10 @@  extern int _cond_resched(void);
 # define might_sleep() \
 	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
 # define sched_annotate_sleep()	(current->task_state_change = 0)
+# define non_block_start() \
+	do { current->non_block_count++; } while (0)
+# define non_block_end() \
+	do { WARN_ON(current->non_block_count-- == 0); } while (0)
 #else
   static inline void ___might_sleep(const char *file, int line,
 				   int preempt_offset) { }
@@ -267,6 +273,8 @@  extern int _cond_resched(void);
 				   int preempt_offset) { }
 # define might_sleep() do { might_resched(); } while (0)
 # define sched_annotate_sleep() do { } while (0)
+# define non_block_start() do { } while (0)
+# define non_block_end() do { } while (0)
 #endif
 
 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ecffd4e37453..41249dbf8f27 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -916,6 +916,10 @@  struct task_struct {
 	struct mutex_waiter		*blocked_on;
 #endif
 
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+	int				non_block_count;
+#endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS
 	unsigned int			irq_events;
 	unsigned long			hardirq_enable_ip;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6fedf3a98581..969d7a71f30c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6113,7 +6113,7 @@  void ___might_sleep(const char *file, int line, int preempt_offset)
 	rcu_sleep_check();
 
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
-	     !is_idle_task(current)) ||
+	     !is_idle_task(current) && !current->non_block_count) ||
 	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
 	    oops_in_progress)
 		return;
@@ -6129,8 +6129,8 @@  void ___might_sleep(const char *file, int line, int preempt_offset)
 		"BUG: sleeping function called from invalid context at %s:%d\n",
 			file, line);
 	printk(KERN_ERR
-		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
-			in_atomic(), irqs_disabled(),
+		"in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(), current->non_block_count,
 			current->pid, current->comm);
 
 	if (task_stack_end_corrupted(current))