diff mbox series

[RFC,01/10] sched: core: save IRQ state during locking

Message ID 20210223023428.757694-2-volodymyr_babchuk@epam.com (mailing list archive)
State New
Headers show
Series Preemption in hypervisor (ARM only) | expand

Commit Message

Volodymyr Babchuk Feb. 23, 2021, 2:34 a.m. UTC
With XEN preemption enabled, scheduler functions can be called with
IRQs disabled (for example, at end of IRQ handler), so we should
save and restore IRQ state in schedulers code.

Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
---
 xen/common/sched/core.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

Comments

Juergen Gross Feb. 23, 2021, 8:52 a.m. UTC | #1
On 23.02.21 03:34, Volodymyr Babchuk wrote:
> With XEN preemption enabled, scheduler functions can be called with
> IRQs disabled (for example, at end of IRQ handler), so we should
> save and restore IRQ state in schedulers code.

This breaks core scheduling.

Waiting for another sibling with interrupts disabled is an absolute
no go, as deadlocks are the consequence.

You could (in theory) make preemption and core scheduling mutually
exclusive, but this would break the forward path to mutexes etc.


Juergen

> 
> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
> ---
>   xen/common/sched/core.c | 33 ++++++++++++++++++---------------
>   1 file changed, 18 insertions(+), 15 deletions(-)
> 
> diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
> index 9745a77eee..7e075613d5 100644
> --- a/xen/common/sched/core.c
> +++ b/xen/common/sched/core.c
> @@ -2470,7 +2470,8 @@ static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
>    * sched_res_rculock has been dropped.
>    */
>   static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
> -                                                   spinlock_t **lock, int cpu,
> +                                                   spinlock_t **lock,
> +                                                   unsigned long *flags, int cpu,
>                                                      s_time_t now)
>   {
>       struct sched_unit *next;
> @@ -2500,7 +2501,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>                   prev->rendezvous_in_cnt++;
>                   atomic_set(&prev->rendezvous_out_cnt, 0);
>   
> -                pcpu_schedule_unlock_irq(*lock, cpu);
> +                pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>   
>                   sched_context_switch(vprev, v, false, now);
>   
> @@ -2530,7 +2531,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>               prev->rendezvous_in_cnt++;
>               atomic_set(&prev->rendezvous_out_cnt, 0);
>   
> -            pcpu_schedule_unlock_irq(*lock, cpu);
> +            pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>   
>               raise_softirq(SCHED_SLAVE_SOFTIRQ);
>               sched_context_switch(vprev, vprev, false, now);
> @@ -2538,11 +2539,11 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>               return NULL;         /* ARM only. */
>           }
>   
> -        pcpu_schedule_unlock_irq(*lock, cpu);
> +        pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>   
>           cpu_relax();
>   
> -        *lock = pcpu_schedule_lock_irq(cpu);
> +        *lock = pcpu_schedule_lock_irqsave(cpu, flags);
>   
>           /*
>            * Check for scheduling resource switched. This happens when we are
> @@ -2557,7 +2558,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>               ASSERT(is_idle_unit(prev));
>               atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
>               prev->rendezvous_in_cnt = 0;
> -            pcpu_schedule_unlock_irq(*lock, cpu);
> +            pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>               rcu_read_unlock(&sched_res_rculock);
>               return NULL;
>           }
> @@ -2574,12 +2575,13 @@ static void sched_slave(void)
>       spinlock_t           *lock;
>       bool                  do_softirq = false;
>       unsigned int          cpu = smp_processor_id();
> +    unsigned long         flags;
>   
>       ASSERT_NOT_IN_ATOMIC();
>   
>       rcu_read_lock(&sched_res_rculock);
>   
> -    lock = pcpu_schedule_lock_irq(cpu);
> +    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
>   
>       now = NOW();
>   
> @@ -2590,7 +2592,7 @@ static void sched_slave(void)
>   
>           if ( v )
>           {
> -            pcpu_schedule_unlock_irq(lock, cpu);
> +            pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>   
>               sched_context_switch(vprev, v, false, now);
>   
> @@ -2602,7 +2604,7 @@ static void sched_slave(void)
>   
>       if ( !prev->rendezvous_in_cnt )
>       {
> -        pcpu_schedule_unlock_irq(lock, cpu);
> +        pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>   
>           rcu_read_unlock(&sched_res_rculock);
>   
> @@ -2615,11 +2617,11 @@ static void sched_slave(void)
>   
>       stop_timer(&get_sched_res(cpu)->s_timer);
>   
> -    next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
> +    next = sched_wait_rendezvous_in(prev, &lock, &flags, cpu, now);
>       if ( !next )
>           return;
>   
> -    pcpu_schedule_unlock_irq(lock, cpu);
> +    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>   
>       sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
>                            is_idle_unit(next) && !is_idle_unit(prev), now);
> @@ -2637,6 +2639,7 @@ static void schedule(void)
>       s_time_t              now;
>       struct sched_resource *sr;
>       spinlock_t           *lock;
> +    unsigned long         flags;
>       int cpu = smp_processor_id();
>       unsigned int          gran;
>   
> @@ -2646,7 +2649,7 @@ static void schedule(void)
>   
>       rcu_read_lock(&sched_res_rculock);
>   
> -    lock = pcpu_schedule_lock_irq(cpu);
> +    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
>   
>       sr = get_sched_res(cpu);
>       gran = sr->granularity;
> @@ -2657,7 +2660,7 @@ static void schedule(void)
>            * We have a race: sched_slave() should be called, so raise a softirq
>            * in order to re-enter schedule() later and call sched_slave() now.
>            */
> -        pcpu_schedule_unlock_irq(lock, cpu);
> +        pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>   
>           rcu_read_unlock(&sched_res_rculock);
>   
> @@ -2676,7 +2679,7 @@ static void schedule(void)
>           prev->rendezvous_in_cnt = gran;
>           cpumask_andnot(mask, sr->cpus, cpumask_of(cpu));
>           cpumask_raise_softirq(mask, SCHED_SLAVE_SOFTIRQ);
> -        next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
> +        next = sched_wait_rendezvous_in(prev, &lock, &flags, cpu, now);
>           if ( !next )
>               return;
>       }
> @@ -2687,7 +2690,7 @@ static void schedule(void)
>           atomic_set(&next->rendezvous_out_cnt, 0);
>       }
>   
> -    pcpu_schedule_unlock_irq(lock, cpu);
> +    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>   
>       vnext = sched_unit2vcpu_cpu(next, cpu);
>       sched_context_switch(vprev, vnext,
>
Volodymyr Babchuk Feb. 23, 2021, 11:15 a.m. UTC | #2
Hi Jurgen,

Jürgen Groß writes:

> On 23.02.21 03:34, Volodymyr Babchuk wrote:
>> With XEN preemption enabled, scheduler functions can be called with
>> IRQs disabled (for example, at end of IRQ handler), so we should
>> save and restore IRQ state in schedulers code.
>
> This breaks core scheduling.

Yes, thank you. I forgot to mention that this PoC is not compatible with
core scheduling. It is not used on ARM, so I could not test it anyways.

> Waiting for another sibling with interrupts disabled is an absolute
> no go, as deadlocks are the consequence.
>
> You could (in theory) make preemption and core scheduling mutually
> exclusive, but this would break the forward path to mutexes etc.
>

Well, I implemented the most naive way to enable hypervisor
preemption. I'm sure that with a bit more careful approach I can make it
compatible with core scheduling. There is no strict requirement to run
scheduler with IRQs disabled.

>
> Juergen
>
>> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
>> ---
>>   xen/common/sched/core.c | 33 ++++++++++++++++++---------------
>>   1 file changed, 18 insertions(+), 15 deletions(-)
>> diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
>> index 9745a77eee..7e075613d5 100644
>> --- a/xen/common/sched/core.c
>> +++ b/xen/common/sched/core.c
>> @@ -2470,7 +2470,8 @@ static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
>>    * sched_res_rculock has been dropped.
>>    */
>>   static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>> -                                                   spinlock_t **lock, int cpu,
>> +                                                   spinlock_t **lock,
>> +                                                   unsigned long *flags, int cpu,
>>                                                      s_time_t now)
>>   {
>>       struct sched_unit *next;
>> @@ -2500,7 +2501,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>>                   prev->rendezvous_in_cnt++;
>>                   atomic_set(&prev->rendezvous_out_cnt, 0);
>>   -                pcpu_schedule_unlock_irq(*lock, cpu);
>> +                pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>>                     sched_context_switch(vprev, v, false, now);
>>   @@ -2530,7 +2531,7 @@ static struct sched_unit
>> *sched_wait_rendezvous_in(struct sched_unit *prev,
>>               prev->rendezvous_in_cnt++;
>>               atomic_set(&prev->rendezvous_out_cnt, 0);
>>   -            pcpu_schedule_unlock_irq(*lock, cpu);
>> +            pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>>                 raise_softirq(SCHED_SLAVE_SOFTIRQ);
>>               sched_context_switch(vprev, vprev, false, now);
>> @@ -2538,11 +2539,11 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>>               return NULL;         /* ARM only. */
>>           }
>>   -        pcpu_schedule_unlock_irq(*lock, cpu);
>> +        pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>>             cpu_relax();
>>   -        *lock = pcpu_schedule_lock_irq(cpu);
>> +        *lock = pcpu_schedule_lock_irqsave(cpu, flags);
>>             /*
>>            * Check for scheduling resource switched. This happens when we are
>> @@ -2557,7 +2558,7 @@ static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
>>               ASSERT(is_idle_unit(prev));
>>               atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
>>               prev->rendezvous_in_cnt = 0;
>> -            pcpu_schedule_unlock_irq(*lock, cpu);
>> +            pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
>>               rcu_read_unlock(&sched_res_rculock);
>>               return NULL;
>>           }
>> @@ -2574,12 +2575,13 @@ static void sched_slave(void)
>>       spinlock_t           *lock;
>>       bool                  do_softirq = false;
>>       unsigned int          cpu = smp_processor_id();
>> +    unsigned long         flags;
>>         ASSERT_NOT_IN_ATOMIC();
>>         rcu_read_lock(&sched_res_rculock);
>>   -    lock = pcpu_schedule_lock_irq(cpu);
>> +    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
>>         now = NOW();
>>   @@ -2590,7 +2592,7 @@ static void sched_slave(void)
>>             if ( v )
>>           {
>> -            pcpu_schedule_unlock_irq(lock, cpu);
>> +            pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>                 sched_context_switch(vprev, v, false, now);
>>   @@ -2602,7 +2604,7 @@ static void sched_slave(void)
>>         if ( !prev->rendezvous_in_cnt )
>>       {
>> -        pcpu_schedule_unlock_irq(lock, cpu);
>> +        pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>             rcu_read_unlock(&sched_res_rculock);
>>   @@ -2615,11 +2617,11 @@ static void sched_slave(void)
>>         stop_timer(&get_sched_res(cpu)->s_timer);
>>   -    next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
>> +    next = sched_wait_rendezvous_in(prev, &lock, &flags, cpu, now);
>>       if ( !next )
>>           return;
>>   -    pcpu_schedule_unlock_irq(lock, cpu);
>> +    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>         sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
>>                            is_idle_unit(next) && !is_idle_unit(prev), now);
>> @@ -2637,6 +2639,7 @@ static void schedule(void)
>>       s_time_t              now;
>>       struct sched_resource *sr;
>>       spinlock_t           *lock;
>> +    unsigned long         flags;
>>       int cpu = smp_processor_id();
>>       unsigned int          gran;
>>   @@ -2646,7 +2649,7 @@ static void schedule(void)
>>         rcu_read_lock(&sched_res_rculock);
>>   -    lock = pcpu_schedule_lock_irq(cpu);
>> +    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
>>         sr = get_sched_res(cpu);
>>       gran = sr->granularity;
>> @@ -2657,7 +2660,7 @@ static void schedule(void)
>>            * We have a race: sched_slave() should be called, so raise a softirq
>>            * in order to re-enter schedule() later and call sched_slave() now.
>>            */
>> -        pcpu_schedule_unlock_irq(lock, cpu);
>> +        pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>             rcu_read_unlock(&sched_res_rculock);
>>   @@ -2676,7 +2679,7 @@ static void schedule(void)
>>           prev->rendezvous_in_cnt = gran;
>>           cpumask_andnot(mask, sr->cpus, cpumask_of(cpu));
>>           cpumask_raise_softirq(mask, SCHED_SLAVE_SOFTIRQ);
>> -        next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
>> +        next = sched_wait_rendezvous_in(prev, &lock, &flags, cpu, now);
>>           if ( !next )
>>               return;
>>       }
>> @@ -2687,7 +2690,7 @@ static void schedule(void)
>>           atomic_set(&next->rendezvous_out_cnt, 0);
>>       }
>>   -    pcpu_schedule_unlock_irq(lock, cpu);
>> +    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
>>         vnext = sched_unit2vcpu_cpu(next, cpu);
>>       sched_context_switch(vprev, vnext,
>>
Andrew Cooper Feb. 24, 2021, 6:29 p.m. UTC | #3
On 23/02/2021 02:34, Volodymyr Babchuk wrote:
> With XEN preemption enabled, scheduler functions can be called with
> IRQs disabled (for example, at end of IRQ handler), so we should
> save and restore IRQ state in schedulers code.
>
> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>

These functions need to fixed not to be _irq() variants in the first place.

The only reason they're like that is so the schedule softirq/irq can
edit the runqueues.  I seem to recall Dario having a plan to switch the
runqueues to using a lockless update mechanism, which IIRC removes any
need for any of the scheduler locks to be irqs-off.

~Andrew
diff mbox series

Patch

diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index 9745a77eee..7e075613d5 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -2470,7 +2470,8 @@  static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
  * sched_res_rculock has been dropped.
  */
 static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
-                                                   spinlock_t **lock, int cpu,
+                                                   spinlock_t **lock,
+                                                   unsigned long *flags, int cpu,
                                                    s_time_t now)
 {
     struct sched_unit *next;
@@ -2500,7 +2501,7 @@  static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
                 prev->rendezvous_in_cnt++;
                 atomic_set(&prev->rendezvous_out_cnt, 0);
 
-                pcpu_schedule_unlock_irq(*lock, cpu);
+                pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
 
                 sched_context_switch(vprev, v, false, now);
 
@@ -2530,7 +2531,7 @@  static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
             prev->rendezvous_in_cnt++;
             atomic_set(&prev->rendezvous_out_cnt, 0);
 
-            pcpu_schedule_unlock_irq(*lock, cpu);
+            pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
 
             raise_softirq(SCHED_SLAVE_SOFTIRQ);
             sched_context_switch(vprev, vprev, false, now);
@@ -2538,11 +2539,11 @@  static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
             return NULL;         /* ARM only. */
         }
 
-        pcpu_schedule_unlock_irq(*lock, cpu);
+        pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
 
         cpu_relax();
 
-        *lock = pcpu_schedule_lock_irq(cpu);
+        *lock = pcpu_schedule_lock_irqsave(cpu, flags);
 
         /*
          * Check for scheduling resource switched. This happens when we are
@@ -2557,7 +2558,7 @@  static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
             ASSERT(is_idle_unit(prev));
             atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
             prev->rendezvous_in_cnt = 0;
-            pcpu_schedule_unlock_irq(*lock, cpu);
+            pcpu_schedule_unlock_irqrestore(*lock, *flags, cpu);
             rcu_read_unlock(&sched_res_rculock);
             return NULL;
         }
@@ -2574,12 +2575,13 @@  static void sched_slave(void)
     spinlock_t           *lock;
     bool                  do_softirq = false;
     unsigned int          cpu = smp_processor_id();
+    unsigned long         flags;
 
     ASSERT_NOT_IN_ATOMIC();
 
     rcu_read_lock(&sched_res_rculock);
 
-    lock = pcpu_schedule_lock_irq(cpu);
+    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
 
     now = NOW();
 
@@ -2590,7 +2592,7 @@  static void sched_slave(void)
 
         if ( v )
         {
-            pcpu_schedule_unlock_irq(lock, cpu);
+            pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
 
             sched_context_switch(vprev, v, false, now);
 
@@ -2602,7 +2604,7 @@  static void sched_slave(void)
 
     if ( !prev->rendezvous_in_cnt )
     {
-        pcpu_schedule_unlock_irq(lock, cpu);
+        pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
 
         rcu_read_unlock(&sched_res_rculock);
 
@@ -2615,11 +2617,11 @@  static void sched_slave(void)
 
     stop_timer(&get_sched_res(cpu)->s_timer);
 
-    next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
+    next = sched_wait_rendezvous_in(prev, &lock, &flags, cpu, now);
     if ( !next )
         return;
 
-    pcpu_schedule_unlock_irq(lock, cpu);
+    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
 
     sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
                          is_idle_unit(next) && !is_idle_unit(prev), now);
@@ -2637,6 +2639,7 @@  static void schedule(void)
     s_time_t              now;
     struct sched_resource *sr;
     spinlock_t           *lock;
+    unsigned long         flags;
     int cpu = smp_processor_id();
     unsigned int          gran;
 
@@ -2646,7 +2649,7 @@  static void schedule(void)
 
     rcu_read_lock(&sched_res_rculock);
 
-    lock = pcpu_schedule_lock_irq(cpu);
+    lock = pcpu_schedule_lock_irqsave(cpu, &flags);
 
     sr = get_sched_res(cpu);
     gran = sr->granularity;
@@ -2657,7 +2660,7 @@  static void schedule(void)
          * We have a race: sched_slave() should be called, so raise a softirq
          * in order to re-enter schedule() later and call sched_slave() now.
          */
-        pcpu_schedule_unlock_irq(lock, cpu);
+        pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
 
         rcu_read_unlock(&sched_res_rculock);
 
@@ -2676,7 +2679,7 @@  static void schedule(void)
         prev->rendezvous_in_cnt = gran;
         cpumask_andnot(mask, sr->cpus, cpumask_of(cpu));
         cpumask_raise_softirq(mask, SCHED_SLAVE_SOFTIRQ);
-        next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
+        next = sched_wait_rendezvous_in(prev, &lock, &flags, cpu, now);
         if ( !next )
             return;
     }
@@ -2687,7 +2690,7 @@  static void schedule(void)
         atomic_set(&next->rendezvous_out_cnt, 0);
     }
 
-    pcpu_schedule_unlock_irq(lock, cpu);
+    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
 
     vnext = sched_unit2vcpu_cpu(next, cpu);
     sched_context_switch(vprev, vnext,