diff mbox series

[v3,2/2] xen: merge temporary vcpu pinning scenarios

Message ID 20190724112658.31495-3-jgross@suse.com (mailing list archive)
State New, archived
Headers show
Series xen: enhance temporary vcpu pinning | expand

Commit Message

Juergen Gross July 24, 2019, 11:26 a.m. UTC
Today there are two scenarios which are pinning vcpus temporarily to
a single physical cpu:

- wait_event() handling
- SCHEDOP_pin_override handling

Each of those cases is handled independently today using its own
temporary cpumask to save the old affinity settings.

The two cases can be combined as the first case will only pin a vcpu to
the physical cpu it is already running on, while SCHEDOP_pin_override is
allowed to fail.

So merge the two temporary pinning scenarios by only using one cpumask
and a per-vcpu bitmask for specifying which of the scenarios is
currently active (they are both allowed to be active for the same vcpu).

Note that we don't need to call domain_update_node_affinity() as we
are only pinning for a brief period of time.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
V2:
- removed the NMI/MCE case
- rename vcpu_set_tmp_affinity() (Jan Beulich)
- remove vcpu_pin_override() wrapper (Andrew Cooper)
- current -> curr (Jan Beulich, Andrew Cooper)
- make cpu parameter unsigned int (Jan Beulich)
- add comment (Dario Faggioli)

V3:
- reject SCHEDOP_pin_override with NR_CPUS (Jan Beulich)
---
 xen/common/domain.c     |  1 +
 xen/common/domctl.c     |  2 +-
 xen/common/schedule.c   | 50 +++++++++++++++++++++++++++++++++++--------------
 xen/common/wait.c       | 30 ++++++++++-------------------
 xen/include/xen/sched.h |  8 +++++---
 5 files changed, 53 insertions(+), 38 deletions(-)

Comments

Dario Faggioli July 25, 2019, 9:23 a.m. UTC | #1
On Wed, 2019-07-24 at 13:26 +0200, Juergen Gross wrote:
> Today there are two scenarios which are pinning vcpus temporarily to
> a single physical cpu:
> 
> - wait_event() handling
> - SCHEDOP_pin_override handling
> 
> Each of those cases are handled independently today using their own
> temporary cpumask to save the old affinity settings.
> 
> The two cases can be combined as the first case will only pin a vcpu
> to
> the physical cpu it is already running on, while SCHEDOP_pin_override
> is
> allowed to fail.
> 
> So merge the two temporary pinning scenarios by only using one
> cpumask
> and a per-vcpu bitmask for specifying which of the scenarios is
> currently active (they are both allowed to be active for the same
> vcpu).
> 
> Note that we don't need to call domain_update_node_affinity() as we
> are only pinning for a brief period of time.
> 
> Signed-off-by: Juergen Gross <jgross@suse.com>
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
>
Reviewed-by: Dario Faggioli <dfaggioli@suse.com>

Regards
Andrew Cooper July 26, 2019, 9:46 a.m. UTC | #2
On 24/07/2019 12:26, Juergen Gross wrote:
> diff --git a/xen/common/wait.c b/xen/common/wait.c
> index 4f830a14e8..3fc5f68611 100644
> --- a/xen/common/wait.c
> +++ b/xen/common/wait.c
> @@ -34,8 +34,6 @@ struct waitqueue_vcpu {
>       */
>      void *esp;
>      char *stack;
> -    cpumask_t saved_affinity;
> -    unsigned int wakeup_cpu;
>  #endif
>  };
>  
> @@ -131,12 +129,10 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
>      ASSERT(wqv->esp == 0);
>  
>      /* Save current VCPU affinity; force wakeup on *this* CPU only. */
> -    wqv->wakeup_cpu = smp_processor_id();
> -    cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
> -    if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
> +    if ( vcpu_temporary_affinity(curr, smp_processor_id(), VCPU_AFFINITY_WAIT) )
>      {
>          gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
> -        domain_crash(current->domain);
> +        domain_crash(curr->domain);
>  
>          for ( ; ; )
>              do_softirq();
> @@ -170,7 +166,7 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
>      if ( unlikely(wqv->esp == 0) )
>      {
>          gdprintk(XENLOG_ERR, "Stack too large in %s\n", __func__);
> -        domain_crash(current->domain);
> +        domain_crash(curr->domain);
>  
>          for ( ; ; )
>              do_softirq();
> @@ -182,30 +178,24 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
>  static void __finish_wait(struct waitqueue_vcpu *wqv)
>  {
>      wqv->esp = NULL;
> -    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
> +    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
>  }
>  
>  void check_wakeup_from_wait(void)
>  {
> -    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
> +    struct vcpu *curr = current;
> +    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
>  
>      ASSERT(list_empty(&wqv->list));
>  
>      if ( likely(wqv->esp == NULL) )
>          return;
>  
> -    /* Check if we woke up on the wrong CPU. */
> -    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
> +    /* Check if we are still pinned. */
> +    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
>      {
> -        /* Re-set VCPU affinity and re-enter the scheduler. */
> -        struct vcpu *curr = current;
> -        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
> -        if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
> -        {
> -            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
> -            domain_crash(current->domain);
> -        }
> -        wait(); /* takes us back into the scheduler */
> +        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
> +        domain_crash(curr->domain);
>      }

I'm sorry to retract my R-by after the fact, but I've only just noticed
(while rebasing some of my pending work over this) that it is buggy.

The reason wait() was called is because it is not safe to leave that
if() clause.

With this change in place, we'll arrange for the VM to be crashed, then
longjmp back into the stack from the waiting vCPU, on the wrong
CPU.  Any caller with smp_processor_id() or thread-local variables cached
by pointer on the stack will then cause memory corruption.

It's not immediately obvious how to fix this, but bear in mind that as
soon as the vm-event interface is done, I plan to delete this whole
waitqueue infrastructure anyway.

~Andrew
Juergen Gross July 26, 2019, 9:53 a.m. UTC | #3
On 26.07.19 11:46, Andrew Cooper wrote:
> On 24/07/2019 12:26, Juergen Gross wrote:
>> diff --git a/xen/common/wait.c b/xen/common/wait.c
>> index 4f830a14e8..3fc5f68611 100644
>> --- a/xen/common/wait.c
>> +++ b/xen/common/wait.c
>> @@ -34,8 +34,6 @@ struct waitqueue_vcpu {
>>        */
>>       void *esp;
>>       char *stack;
>> -    cpumask_t saved_affinity;
>> -    unsigned int wakeup_cpu;
>>   #endif
>>   };
>>   
>> @@ -131,12 +129,10 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
>>       ASSERT(wqv->esp == 0);
>>   
>>       /* Save current VCPU affinity; force wakeup on *this* CPU only. */
>> -    wqv->wakeup_cpu = smp_processor_id();
>> -    cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>> -    if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
>> +    if ( vcpu_temporary_affinity(curr, smp_processor_id(), VCPU_AFFINITY_WAIT) )
>>       {
>>           gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>> -        domain_crash(current->domain);
>> +        domain_crash(curr->domain);
>>   
>>           for ( ; ; )
>>               do_softirq();
>> @@ -170,7 +166,7 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
>>       if ( unlikely(wqv->esp == 0) )
>>       {
>>           gdprintk(XENLOG_ERR, "Stack too large in %s\n", __func__);
>> -        domain_crash(current->domain);
>> +        domain_crash(curr->domain);
>>   
>>           for ( ; ; )
>>               do_softirq();
>> @@ -182,30 +178,24 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
>>   static void __finish_wait(struct waitqueue_vcpu *wqv)
>>   {
>>       wqv->esp = NULL;
>> -    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
>> +    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
>>   }
>>   
>>   void check_wakeup_from_wait(void)
>>   {
>> -    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
>> +    struct vcpu *curr = current;
>> +    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
>>   
>>       ASSERT(list_empty(&wqv->list));
>>   
>>       if ( likely(wqv->esp == NULL) )
>>           return;
>>   
>> -    /* Check if we woke up on the wrong CPU. */
>> -    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
>> +    /* Check if we are still pinned. */
>> +    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
>>       {
>> -        /* Re-set VCPU affinity and re-enter the scheduler. */
>> -        struct vcpu *curr = current;
>> -        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>> -        if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
>> -        {
>> -            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>> -            domain_crash(current->domain);
>> -        }
>> -        wait(); /* takes us back into the scheduler */
>> +        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
>> +        domain_crash(curr->domain);
>>       }
> 
> I'm sorry to retract my R-by after the fact, but I've only just noticed
> (while rebasing some of my pending work over this) that it is buggy.
> 
> The reason wait() was called is because it is not safe to leave that
> if() clause.
> 
> With this change in place, we'll arrange for the VM to be crashed, then
> longjump back into the stack from from the waiting vCPU, on the wrong
> CPU.  Any caller with smp_processor_id() or thread-local variables cache
> by pointer on the stack will then cause memory corruption.
> 
> Its not immediately obvious how to fix this, but bear in mind that as
> soon as the vm-event interface is done, I plan to delete this whole
> waitqueue infrastructure anyway.

Shouldn't just calling wait() after domain_crash() be fine then?

That's what would have happened in the original error case, too.


Juergen
Andrew Cooper July 26, 2019, 10:13 a.m. UTC | #4
On 26/07/2019 10:53, Juergen Gross wrote:
> On 26.07.19 11:46, Andrew Cooper wrote:
>> On 24/07/2019 12:26, Juergen Gross wrote:
>>> diff --git a/xen/common/wait.c b/xen/common/wait.c
>>> index 4f830a14e8..3fc5f68611 100644
>>> --- a/xen/common/wait.c
>>> +++ b/xen/common/wait.c
>>> @@ -34,8 +34,6 @@ struct waitqueue_vcpu {
>>>        */
>>>       void *esp;
>>>       char *stack;
>>> -    cpumask_t saved_affinity;
>>> -    unsigned int wakeup_cpu;
>>>   #endif
>>>   };
>>>   @@ -131,12 +129,10 @@ static void __prepare_to_wait(struct
>>> waitqueue_vcpu *wqv)
>>>       ASSERT(wqv->esp == 0);
>>>         /* Save current VCPU affinity; force wakeup on *this* CPU
>>> only. */
>>> -    wqv->wakeup_cpu = smp_processor_id();
>>> -    cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>>> -    if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
>>> +    if ( vcpu_temporary_affinity(curr, smp_processor_id(),
>>> VCPU_AFFINITY_WAIT) )
>>>       {
>>>           gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>>> -        domain_crash(current->domain);
>>> +        domain_crash(curr->domain);
>>>             for ( ; ; )
>>>               do_softirq();
>>> @@ -170,7 +166,7 @@ static void __prepare_to_wait(struct
>>> waitqueue_vcpu *wqv)
>>>       if ( unlikely(wqv->esp == 0) )
>>>       {
>>>           gdprintk(XENLOG_ERR, "Stack too large in %s\n", __func__);
>>> -        domain_crash(current->domain);
>>> +        domain_crash(curr->domain);
>>>             for ( ; ; )
>>>               do_softirq();
>>> @@ -182,30 +178,24 @@ static void __prepare_to_wait(struct
>>> waitqueue_vcpu *wqv)
>>>   static void __finish_wait(struct waitqueue_vcpu *wqv)
>>>   {
>>>       wqv->esp = NULL;
>>> -    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
>>> +    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
>>>   }
>>>     void check_wakeup_from_wait(void)
>>>   {
>>> -    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
>>> +    struct vcpu *curr = current;
>>> +    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
>>>         ASSERT(list_empty(&wqv->list));
>>>         if ( likely(wqv->esp == NULL) )
>>>           return;
>>>   -    /* Check if we woke up on the wrong CPU. */
>>> -    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
>>> +    /* Check if we are still pinned. */
>>> +    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
>>>       {
>>> -        /* Re-set VCPU affinity and re-enter the scheduler. */
>>> -        struct vcpu *curr = current;
>>> -        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>>> -        if ( vcpu_set_hard_affinity(curr,
>>> cpumask_of(wqv->wakeup_cpu)) )
>>> -        {
>>> -            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>>> -            domain_crash(current->domain);
>>> -        }
>>> -        wait(); /* takes us back into the scheduler */
>>> +        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
>>> +        domain_crash(curr->domain);
>>>       }
>>
>> I'm sorry to retract my R-by after the fact, but I've only just noticed
>> (while rebasing some of my pending work over this) that it is buggy.
>>
>> The reason wait() was called is because it is not safe to leave that
>> if() clause.
>>
>> With this change in place, we'll arrange for the VM to be crashed, then
>> longjump back into the stack from from the waiting vCPU, on the wrong
>> CPU.  Any caller with smp_processor_id() or thread-local variables cache
>> by pointer on the stack will then cause memory corruption.
>>
>> Its not immediately obvious how to fix this, but bear in mind that as
>> soon as the vm-event interface is done, I plan to delete this whole
>> waitqueue infrastructure anyway.
>
> Shouldn't just calling wait() after domain_crash() be fine then?
>
> That's what would have happened in the original error case, too.

No - I don't think so.  That was to try and get back into a position
where the scheduler rescheduled this vcpu on the correct cpu, so it
could safely longjmp back into context.

With the domain crash here, nothing will happen[1] until we do
successfully longjmp() back into context, because we've got a stack
frame which needs unwinding before it is safe to start cleaning the
domain up.

~Andrew

[1] If something other than nothing happens, then we've got a
refcounting issue...
Juergen Gross July 26, 2019, 10:19 a.m. UTC | #5
On 26.07.19 12:13, Andrew Cooper wrote:
> On 26/07/2019 10:53, Juergen Gross wrote:
>> On 26.07.19 11:46, Andrew Cooper wrote:
>>> On 24/07/2019 12:26, Juergen Gross wrote:
>>>> diff --git a/xen/common/wait.c b/xen/common/wait.c
>>>> index 4f830a14e8..3fc5f68611 100644
>>>> --- a/xen/common/wait.c
>>>> +++ b/xen/common/wait.c
>>>> @@ -34,8 +34,6 @@ struct waitqueue_vcpu {
>>>>         */
>>>>        void *esp;
>>>>        char *stack;
>>>> -    cpumask_t saved_affinity;
>>>> -    unsigned int wakeup_cpu;
>>>>    #endif
>>>>    };
>>>>    @@ -131,12 +129,10 @@ static void __prepare_to_wait(struct
>>>> waitqueue_vcpu *wqv)
>>>>        ASSERT(wqv->esp == 0);
>>>>          /* Save current VCPU affinity; force wakeup on *this* CPU
>>>> only. */
>>>> -    wqv->wakeup_cpu = smp_processor_id();
>>>> -    cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>>>> -    if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
>>>> +    if ( vcpu_temporary_affinity(curr, smp_processor_id(),
>>>> VCPU_AFFINITY_WAIT) )
>>>>        {
>>>>            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>>>> -        domain_crash(current->domain);
>>>> +        domain_crash(curr->domain);
>>>>              for ( ; ; )
>>>>                do_softirq();
>>>> @@ -170,7 +166,7 @@ static void __prepare_to_wait(struct
>>>> waitqueue_vcpu *wqv)
>>>>        if ( unlikely(wqv->esp == 0) )
>>>>        {
>>>>            gdprintk(XENLOG_ERR, "Stack too large in %s\n", __func__);
>>>> -        domain_crash(current->domain);
>>>> +        domain_crash(curr->domain);
>>>>              for ( ; ; )
>>>>                do_softirq();
>>>> @@ -182,30 +178,24 @@ static void __prepare_to_wait(struct
>>>> waitqueue_vcpu *wqv)
>>>>    static void __finish_wait(struct waitqueue_vcpu *wqv)
>>>>    {
>>>>        wqv->esp = NULL;
>>>> -    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
>>>> +    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
>>>>    }
>>>>      void check_wakeup_from_wait(void)
>>>>    {
>>>> -    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
>>>> +    struct vcpu *curr = current;
>>>> +    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
>>>>          ASSERT(list_empty(&wqv->list));
>>>>          if ( likely(wqv->esp == NULL) )
>>>>            return;
>>>>    -    /* Check if we woke up on the wrong CPU. */
>>>> -    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
>>>> +    /* Check if we are still pinned. */
>>>> +    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
>>>>        {
>>>> -        /* Re-set VCPU affinity and re-enter the scheduler. */
>>>> -        struct vcpu *curr = current;
>>>> -        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>>>> -        if ( vcpu_set_hard_affinity(curr,
>>>> cpumask_of(wqv->wakeup_cpu)) )
>>>> -        {
>>>> -            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>>>> -            domain_crash(current->domain);
>>>> -        }
>>>> -        wait(); /* takes us back into the scheduler */
>>>> +        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
>>>> +        domain_crash(curr->domain);
>>>>        }
>>>
>>> I'm sorry to retract my R-by after the fact, but I've only just noticed
>>> (while rebasing some of my pending work over this) that it is buggy.
>>>
>>> The reason wait() was called is because it is not safe to leave that
>>> if() clause.
>>>
>>> With this change in place, we'll arrange for the VM to be crashed, then
>>> longjump back into the stack from from the waiting vCPU, on the wrong
>>> CPU.  Any caller with smp_processor_id() or thread-local variables cache
>>> by pointer on the stack will then cause memory corruption.
>>>
>>> Its not immediately obvious how to fix this, but bear in mind that as
>>> soon as the vm-event interface is done, I plan to delete this whole
>>> waitqueue infrastructure anyway.
>>
>> Shouldn't just calling wait() after domain_crash() be fine then?
>>
>> That's what would have happened in the original error case, too.
> 
> No - I don't think so.  That was to try and get back into a position
> where the scheduler rescheduled this vcpu on the correct cpu, so it
> could safely longjmp back into context.

But there was a domain_crash() in the code I removed.

In case this already was a problem then I guess the domain_crash()
might need to be replaced by panic(). The only case I'm aware of where
this situation could arise would be a suspend/resume cycle where
wait_event() was active and not all cpus came up again on resume.
That seems to be quite improbable.


Juergen
Jan Beulich July 26, 2019, 11:35 a.m. UTC | #6
On 26.07.2019 11:46, Andrew Cooper wrote:
> On 24/07/2019 12:26, Juergen Gross wrote:
>> @@ -182,30 +178,24 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
>>   static void __finish_wait(struct waitqueue_vcpu *wqv)
>>   {
>>       wqv->esp = NULL;
>> -    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
>> +    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
>>   }
>>   
>>   void check_wakeup_from_wait(void)
>>   {
>> -    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
>> +    struct vcpu *curr = current;
>> +    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
>>   
>>       ASSERT(list_empty(&wqv->list));
>>   
>>       if ( likely(wqv->esp == NULL) )
>>           return;
>>   
>> -    /* Check if we woke up on the wrong CPU. */
>> -    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
>> +    /* Check if we are still pinned. */
>> +    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
>>       {
>> -        /* Re-set VCPU affinity and re-enter the scheduler. */
>> -        struct vcpu *curr = current;
>> -        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>> -        if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
>> -        {
>> -            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>> -            domain_crash(current->domain);
>> -        }
>> -        wait(); /* takes us back into the scheduler */
>> +        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
>> +        domain_crash(curr->domain);
>>       }
> 
> I'm sorry to retract my R-by after the fact, but I've only just noticed
> (while rebasing some of my pending work over this) that it is buggy.
> 
> The reason wait() was called is because it is not safe to leave that
> if() clause.
> 
> With this change in place, we'll arrange for the VM to be crashed, then
> longjump back into the stack from from the waiting vCPU, on the wrong
> CPU.  Any caller with smp_processor_id() or thread-local variables cache
> by pointer on the stack will then cause memory corruption.
> 
> Its not immediately obvious how to fix this, but bear in mind that as
> soon as the vm-event interface is done, I plan to delete this whole
> waitqueue infrastructure anyway.

In which case - should we revert the commit until this is resolved?

Jan
Juergen Gross July 26, 2019, 11:42 a.m. UTC | #7
On 26.07.19 13:35, Jan Beulich wrote:
> On 26.07.2019 11:46, Andrew Cooper wrote:
>> On 24/07/2019 12:26, Juergen Gross wrote:
>>> @@ -182,30 +178,24 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
>>>    static void __finish_wait(struct waitqueue_vcpu *wqv)
>>>    {
>>>        wqv->esp = NULL;
>>> -    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
>>> +    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
>>>    }
>>>    
>>>    void check_wakeup_from_wait(void)
>>>    {
>>> -    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
>>> +    struct vcpu *curr = current;
>>> +    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
>>>    
>>>        ASSERT(list_empty(&wqv->list));
>>>    
>>>        if ( likely(wqv->esp == NULL) )
>>>            return;
>>>    
>>> -    /* Check if we woke up on the wrong CPU. */
>>> -    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
>>> +    /* Check if we are still pinned. */
>>> +    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
>>>        {
>>> -        /* Re-set VCPU affinity and re-enter the scheduler. */
>>> -        struct vcpu *curr = current;
>>> -        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>>> -        if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
>>> -        {
>>> -            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>>> -            domain_crash(current->domain);
>>> -        }
>>> -        wait(); /* takes us back into the scheduler */
>>> +        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
>>> +        domain_crash(curr->domain);
>>>        }
>>
>> I'm sorry to retract my R-by after the fact, but I've only just noticed
>> (while rebasing some of my pending work over this) that it is buggy.
>>
>> The reason wait() was called is because it is not safe to leave that
>> if() clause.
>>
>> With this change in place, we'll arrange for the VM to be crashed, then
>> longjump back into the stack from from the waiting vCPU, on the wrong
>> CPU.  Any caller with smp_processor_id() or thread-local variables cache
>> by pointer on the stack will then cause memory corruption.
>>
>> Its not immediately obvious how to fix this, but bear in mind that as
>> soon as the vm-event interface is done, I plan to delete this whole
>> waitqueue infrastructure anyway.
> 
> In which case - should we revert the commit until this is resolved?

In my opinion it is not that urgent. I don't think any of our OSStests
will ever be able to trigger this issue, as AFAIK no test is using the
wait_event() interface nor do they test suspend/resume. And both need
to be true (at the same time!) plus a cpu needs to fail coming up when
resuming again.


Juergen
Andrew Cooper July 26, 2019, 11:50 a.m. UTC | #8
On 26/07/2019 12:42, Juergen Gross wrote:
> On 26.07.19 13:35, Jan Beulich wrote:
>> On 26.07.2019 11:46, Andrew Cooper wrote:
>>> On 24/07/2019 12:26, Juergen Gross wrote:
>>>> @@ -182,30 +178,24 @@ static void __prepare_to_wait(struct
>>>> waitqueue_vcpu *wqv)
>>>>    static void __finish_wait(struct waitqueue_vcpu *wqv)
>>>>    {
>>>>        wqv->esp = NULL;
>>>> -    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
>>>> +    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
>>>>    }
>>>>       void check_wakeup_from_wait(void)
>>>>    {
>>>> -    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
>>>> +    struct vcpu *curr = current;
>>>> +    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
>>>>           ASSERT(list_empty(&wqv->list));
>>>>           if ( likely(wqv->esp == NULL) )
>>>>            return;
>>>>    -    /* Check if we woke up on the wrong CPU. */
>>>> -    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
>>>> +    /* Check if we are still pinned. */
>>>> +    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
>>>>        {
>>>> -        /* Re-set VCPU affinity and re-enter the scheduler. */
>>>> -        struct vcpu *curr = current;
>>>> -        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>>>> -        if ( vcpu_set_hard_affinity(curr,
>>>> cpumask_of(wqv->wakeup_cpu)) )
>>>> -        {
>>>> -            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>>>> -            domain_crash(current->domain);
>>>> -        }
>>>> -        wait(); /* takes us back into the scheduler */
>>>> +        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
>>>> +        domain_crash(curr->domain);
>>>>        }
>>>
>>> I'm sorry to retract my R-by after the fact, but I've only just noticed
>>> (while rebasing some of my pending work over this) that it is buggy.
>>>
>>> The reason wait() was called is because it is not safe to leave that
>>> if() clause.
>>>
>>> With this change in place, we'll arrange for the VM to be crashed, then
>>> longjump back into the stack from from the waiting vCPU, on the wrong
>>> CPU.  Any caller with smp_processor_id() or thread-local variables
>>> cache
>>> by pointer on the stack will then cause memory corruption.
>>>
>>> Its not immediately obvious how to fix this, but bear in mind that as
>>> soon as the vm-event interface is done, I plan to delete this whole
>>> waitqueue infrastructure anyway.
>>
>> In which case - should we revert the commit until this is resolved?
>
> In my opinion it is not that urgent. I don't think any of our OSStests
> will ever be able to trigger this issue, as AFAIK no test is using the
> wait_event() interface nor do they test suspend/resume. And both need
> to be true (at the same time!) plus a cpu needs to fail coming up when
> resuming again.

Yeah - I don't think reverting it is necessary, but I will flag
"resolving this somehow" as a 4.12 blocker.

The HVI scale tests trigger this path.  Guess how I discovered that
Introspection + Livepatching = boom.

I am leaning on the side of panic().  I agree that if the APIs are used
correctly, it can't occur.

~Andrew
Juergen Gross July 28, 2019, 9:24 a.m. UTC | #9
On 26.07.19 13:50, Andrew Cooper wrote:
> On 26/07/2019 12:42, Juergen Gross wrote:
>> On 26.07.19 13:35, Jan Beulich wrote:
>>> On 26.07.2019 11:46, Andrew Cooper wrote:
>>>> On 24/07/2019 12:26, Juergen Gross wrote:
>>>>> @@ -182,30 +178,24 @@ static void __prepare_to_wait(struct
>>>>> waitqueue_vcpu *wqv)
>>>>>     static void __finish_wait(struct waitqueue_vcpu *wqv)
>>>>>     {
>>>>>         wqv->esp = NULL;
>>>>> -    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
>>>>> +    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
>>>>>     }
>>>>>        void check_wakeup_from_wait(void)
>>>>>     {
>>>>> -    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
>>>>> +    struct vcpu *curr = current;
>>>>> +    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
>>>>>            ASSERT(list_empty(&wqv->list));
>>>>>            if ( likely(wqv->esp == NULL) )
>>>>>             return;
>>>>>     -    /* Check if we woke up on the wrong CPU. */
>>>>> -    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
>>>>> +    /* Check if we are still pinned. */
>>>>> +    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
>>>>>         {
>>>>> -        /* Re-set VCPU affinity and re-enter the scheduler. */
>>>>> -        struct vcpu *curr = current;
>>>>> -        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
>>>>> -        if ( vcpu_set_hard_affinity(curr,
>>>>> cpumask_of(wqv->wakeup_cpu)) )
>>>>> -        {
>>>>> -            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
>>>>> -            domain_crash(current->domain);
>>>>> -        }
>>>>> -        wait(); /* takes us back into the scheduler */
>>>>> +        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
>>>>> +        domain_crash(curr->domain);
>>>>>         }
>>>>
>>>> I'm sorry to retract my R-by after the fact, but I've only just noticed
>>>> (while rebasing some of my pending work over this) that it is buggy.
>>>>
>>>> The reason wait() was called is because it is not safe to leave that
>>>> if() clause.
>>>>
>>>> With this change in place, we'll arrange for the VM to be crashed, then
>>>> longjump back into the stack from from the waiting vCPU, on the wrong
>>>> CPU.  Any caller with smp_processor_id() or thread-local variables
>>>> cache
>>>> by pointer on the stack will then cause memory corruption.
>>>>
>>>> Its not immediately obvious how to fix this, but bear in mind that as
>>>> soon as the vm-event interface is done, I plan to delete this whole
>>>> waitqueue infrastructure anyway.
>>>
>>> In which case - should we revert the commit until this is resolved?
>>
>> In my opinion it is not that urgent. I don't think any of our OSStests
>> will ever be able to trigger this issue, as AFAIK no test is using the
>> wait_event() interface nor do they test suspend/resume. And both need
>> to be true (at the same time!) plus a cpu needs to fail coming up when
>> resuming again.
> 
> Yeah - I don't think reverting it is necessary, but I will flag
> "resolving this somehow" as a 4.12 blocker.
> 
> The HVI scale tests trigger this path.  Guess how I discovered that
> Introspection + Livepatching = boom.
> 
> I am leaning on the side of panic().  I agree that if the APIs are used
> correctly, it can't occur.

Hmm, shouldn't

    domain_crash();
    raise_softirq(SCHEDULE_SOFTIRQ);
    return;

in check_wakeup_from_wait() just work?


Juergen
diff mbox series

Patch

diff --git a/xen/common/domain.c b/xen/common/domain.c
index bc56a51815..e8e850796e 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1267,6 +1267,7 @@  int vcpu_reset(struct vcpu *v)
     v->async_exception_mask = 0;
     memset(v->async_exception_state, 0, sizeof(v->async_exception_state));
 #endif
+    v->affinity_broken = 0;
     clear_bit(_VPF_blocked, &v->pause_flags);
     clear_bit(_VPF_in_reset, &v->pause_flags);
 
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 72a44953d0..fa260ce5fb 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -654,7 +654,7 @@  long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 
             /* Undo a stuck SCHED_pin_override? */
             if ( vcpuaff->flags & XEN_VCPUAFFINITY_FORCE )
-                vcpu_pin_override(v, -1);
+                vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_OVERRIDE);
 
             ret = 0;
 
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 349f9624f5..130b97d875 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -1106,43 +1106,59 @@  void watchdog_domain_destroy(struct domain *d)
         kill_timer(&d->watchdog_timer[i]);
 }
 
-int vcpu_pin_override(struct vcpu *v, int cpu)
+/*
+ * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if
+ * cpu is NR_CPUS).
+ * Temporary pinning can be done due to two reasons, which may be nested:
+ * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case
+ *   of a conflict (e.g. in case cpupool doesn't include requested CPU, or
+ *   another conflicting temporary pinning is already in effect).
+ * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the
+ *   CPU it is just running on. Can't fail if used properly.
+ */
+int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason)
 {
     spinlock_t *lock;
     int ret = -EINVAL;
+    bool migrate;
 
     lock = vcpu_schedule_lock_irq(v);
 
-    if ( cpu < 0 )
+    if ( cpu == NR_CPUS )
     {
-        if ( v->affinity_broken )
+        if ( v->affinity_broken & reason )
         {
-            sched_set_affinity(v, v->cpu_hard_affinity_saved, NULL);
-            v->affinity_broken = 0;
             ret = 0;
+            v->affinity_broken &= ~reason;
         }
+        if ( !ret && !v->affinity_broken )
+            sched_set_affinity(v, v->cpu_hard_affinity_saved, NULL);
     }
     else if ( cpu < nr_cpu_ids )
     {
-        if ( v->affinity_broken )
+        if ( (v->affinity_broken & reason) ||
+             (v->affinity_broken && v->processor != cpu) )
             ret = -EBUSY;
         else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
         {
-            cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
-            v->affinity_broken = 1;
-            sched_set_affinity(v, cpumask_of(cpu), NULL);
+            if ( !v->affinity_broken )
+            {
+                cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
+                sched_set_affinity(v, cpumask_of(cpu), NULL);
+            }
+            v->affinity_broken |= reason;
             ret = 0;
         }
     }
 
-    if ( ret == 0 )
+    migrate = !ret && !cpumask_test_cpu(v->processor, v->cpu_hard_affinity);
+    if ( migrate )
         vcpu_migrate_start(v);
 
     vcpu_schedule_unlock_irq(lock, v);
 
-    domain_update_node_affinity(v->domain);
-
-    vcpu_migrate_finish(v);
+    if ( migrate )
+        vcpu_migrate_finish(v);
 
     return ret;
 }
@@ -1258,6 +1274,7 @@  ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
     case SCHEDOP_pin_override:
     {
         struct sched_pin_override sched_pin_override;
+        unsigned int cpu;
 
         ret = -EPERM;
         if ( !is_hardware_domain(current->domain) )
@@ -1267,7 +1284,12 @@  ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
         if ( copy_from_guest(&sched_pin_override, arg, 1) )
             break;
 
-        ret = vcpu_pin_override(current, sched_pin_override.pcpu);
+        ret = -EINVAL;
+        if ( sched_pin_override.pcpu >= NR_CPUS )
+           break;
+
+        cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu;
+        ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE);
 
         break;
     }
diff --git a/xen/common/wait.c b/xen/common/wait.c
index 4f830a14e8..3fc5f68611 100644
--- a/xen/common/wait.c
+++ b/xen/common/wait.c
@@ -34,8 +34,6 @@  struct waitqueue_vcpu {
      */
     void *esp;
     char *stack;
-    cpumask_t saved_affinity;
-    unsigned int wakeup_cpu;
 #endif
 };
 
@@ -131,12 +129,10 @@  static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
     ASSERT(wqv->esp == 0);
 
     /* Save current VCPU affinity; force wakeup on *this* CPU only. */
-    wqv->wakeup_cpu = smp_processor_id();
-    cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
-    if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
+    if ( vcpu_temporary_affinity(curr, smp_processor_id(), VCPU_AFFINITY_WAIT) )
     {
         gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
-        domain_crash(current->domain);
+        domain_crash(curr->domain);
 
         for ( ; ; )
             do_softirq();
@@ -170,7 +166,7 @@  static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
     if ( unlikely(wqv->esp == 0) )
     {
         gdprintk(XENLOG_ERR, "Stack too large in %s\n", __func__);
-        domain_crash(current->domain);
+        domain_crash(curr->domain);
 
         for ( ; ; )
             do_softirq();
@@ -182,30 +178,24 @@  static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
 static void __finish_wait(struct waitqueue_vcpu *wqv)
 {
     wqv->esp = NULL;
-    (void)vcpu_set_hard_affinity(current, &wqv->saved_affinity);
+    vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
 }
 
 void check_wakeup_from_wait(void)
 {
-    struct waitqueue_vcpu *wqv = current->waitqueue_vcpu;
+    struct vcpu *curr = current;
+    struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu;
 
     ASSERT(list_empty(&wqv->list));
 
     if ( likely(wqv->esp == NULL) )
         return;
 
-    /* Check if we woke up on the wrong CPU. */
-    if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) )
+    /* Check if we are still pinned. */
+    if ( unlikely(!(curr->affinity_broken & VCPU_AFFINITY_WAIT)) )
     {
-        /* Re-set VCPU affinity and re-enter the scheduler. */
-        struct vcpu *curr = current;
-        cpumask_copy(&wqv->saved_affinity, curr->cpu_hard_affinity);
-        if ( vcpu_set_hard_affinity(curr, cpumask_of(wqv->wakeup_cpu)) )
-        {
-            gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n");
-            domain_crash(current->domain);
-        }
-        wait(); /* takes us back into the scheduler */
+        gdprintk(XENLOG_ERR, "vcpu affinity lost\n");
+        domain_crash(curr->domain);
     }
 
     /*
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index c197e93d73..9578628c6a 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -200,7 +200,9 @@  struct vcpu
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
     bool             paused_for_shutdown;
     /* VCPU need affinity restored */
-    bool             affinity_broken;
+    uint8_t          affinity_broken;
+#define VCPU_AFFINITY_OVERRIDE    0x01
+#define VCPU_AFFINITY_WAIT        0x02
 
     /* A hypercall has been preempted. */
     bool             hcall_preempted;
@@ -245,7 +247,7 @@  struct vcpu
 
     /* Bitmask of CPUs on which this VCPU may run. */
     cpumask_var_t    cpu_hard_affinity;
-    /* Used to restore affinity across S3. */
+    /* Used to save affinity during temporary pinning. */
     cpumask_var_t    cpu_hard_affinity_saved;
 
     /* Bitmask of CPUs on which this VCPU prefers to run. */
@@ -873,10 +875,10 @@  int cpu_disable_scheduler(unsigned int cpu);
 /* We need it in dom0_setup_vcpu */
 void sched_set_affinity(struct vcpu *v, const cpumask_t *hard,
                         const cpumask_t *soft);
+int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason);
 int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity);
 int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity);
 void restore_vcpu_affinity(struct domain *d);
-int vcpu_pin_override(struct vcpu *v, int cpu);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);