[v3,5/5] x86: add accessors for scratch cpu mask

Message ID 20200224104645.96381-6-roger.pau@citrix.com (mailing list archive)
State New, archived
Series x86: fixes/improvements for scratch cpumask

Commit Message

Roger Pau Monné Feb. 24, 2020, 10:46 a.m. UTC
Current usage of the per-CPU scratch cpumask is dangerous since
there's no way to figure out if the mask is already being used except
for manual code inspection of all the callers and possible call paths.

This is unsafe and not reliable, so introduce a minimal get/put
infrastructure to prevent nested usage of the scratch mask and usage
in interrupt context.

Move the declaration of scratch_cpumask to smp.c in order to place the
declaration and the accessors as close as possible.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Changes since v1:
 - Use __builtin_return_address(0) instead of __func__.
 - Move declaration of scratch_cpumask and scratch_cpumask accessor to
   smp.c.
 - Do not allow usage in #MC or #NMI context.
---
 xen/arch/x86/io_apic.c    |  6 ++++--
 xen/arch/x86/irq.c        | 13 ++++++++++---
 xen/arch/x86/mm.c         | 30 +++++++++++++++++++++---------
 xen/arch/x86/msi.c        |  4 +++-
 xen/arch/x86/smp.c        | 25 +++++++++++++++++++++++++
 xen/arch/x86/smpboot.c    |  1 -
 xen/include/asm-x86/smp.h | 10 ++++++++++
 7 files changed, 73 insertions(+), 16 deletions(-)
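
A minimal sketch of the calling pattern the accessors are meant to enforce (illustrative only; "some_cpumask" is a placeholder, and the real call sites are in the hunks below):

    cpumask_t *mask = get_scratch_cpumask();

    cpumask_and(mask, some_cpumask, &cpu_online_map);
    /* ... use mask; a nested get_scratch_cpumask(), or use in IRQ/#MC/#NMI
     * context, would trip the BUG checks in the debug accessor ... */
    put_scratch_cpumask();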

Comments

Jan Beulich Feb. 26, 2020, 10:36 a.m. UTC | #1
On 24.02.2020 11:46, Roger Pau Monne wrote:
> Current usage of the per-CPU scratch cpumask is dangerous since
> there's no way to figure out if the mask is already being used except
> for manual code inspection of all the callers and possible call paths.
> 
> This is unsafe and not reliable, so introduce a minimal get/put
> infrastructure to prevent nested usage of the scratch mask and usage
> in interrupt context.

While I can see the reasoning (especially in light of the change
which did violate the assumption), I'm still uncertain if this isn't
"over-engineering". Andrew, do you have a clear opinion one way or
the other here?

> Move the declaration of scratch_cpumask to smp.c in order to place the
> declaration and the accessors as close as possible.

s/declaration/definition/g

> --- a/xen/arch/x86/irq.c
> +++ b/xen/arch/x86/irq.c
> @@ -196,7 +196,7 @@ static void _clear_irq_vector(struct irq_desc *desc)
>  {
>      unsigned int cpu, old_vector, irq = desc->irq;
>      unsigned int vector = desc->arch.vector;
> -    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
> +    cpumask_t *tmp_mask = get_scratch_cpumask();
>  
>      BUG_ON(!valid_irq_vector(vector));
>  
> @@ -223,7 +223,10 @@ static void _clear_irq_vector(struct irq_desc *desc)
>      trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
>  
>      if ( likely(!desc->arch.move_in_progress) )
> +    {
> +        put_scratch_cpumask();
>          return;
> +    }

I think if possible such error path adjustments would better be
avoided. And this seems feasible here: There are two entirely
independent uses of the scratch mask in this function. You could
therefore put the mask before this point and get it again further
down, or you could leverage a property of the current
implementation, plus the fact that the 2nd use doesn't involve
any "real" function calls, and avoid a 2nd get/put altogether.

Of course another question then is whether it is a good property
of the current model, i.e. whether it wouldn't be better for
"put" to actually zap the pointer, to prevent subsequent use.

> @@ -2531,12 +2536,12 @@ void fixup_irqs(const cpumask_t *mask, bool verbose)
>      unsigned int irq;
>      static int warned;
>      struct irq_desc *desc;
> +    cpumask_t *affinity = get_scratch_cpumask();
>  
>      for ( irq = 0; irq < nr_irqs; irq++ )
>      {
>          bool break_affinity = false, set_affinity = true;
>          unsigned int vector;
> -        cpumask_t *affinity = this_cpu(scratch_cpumask);
>  
>          if ( irq == 2 )
>              continue;
> @@ -2640,6 +2645,8 @@ void fixup_irqs(const cpumask_t *mask, bool verbose)
>                     irq, CPUMASK_PR(affinity));
>      }
>  
> +    put_scratch_cpumask();

Just as a remark, not necessarily as a request to change the code: I
wonder if down the road this pretty wide scope of "holding" the mask
isn't going to bite us, when a function called from here (in a range
of code not actively needing the mask) also may want to use the mask.
But of course we can make this finer grained at the point where it
might actually start mattering.

> @@ -3645,12 +3647,17 @@ long do_mmuext_op(
>                                     mask)) )
>                  rc = -EINVAL;
>              if ( unlikely(rc) )
> +            {
> +                put_scratch_cpumask();
>                  break;
> +            }

Again, instead of adjusting an error path, how about making this
have an empty statement (i.e. dropping the break) and ...

>              if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )

... having this become "else if()"?
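
I.e. roughly (sketch only):

    if ( unlikely(rc) )
        ; /* error: fall through to the single put/break below */
    else if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
        flush_tlb_mask(mask);
    else if ( __addr_ok(op.arg1.linear_addr) )
        flush_tlb_one_mask(mask, op.arg1.linear_addr);

    put_scratch_cpumask();
    break;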

> @@ -4384,6 +4393,9 @@ static int __do_update_va_mapping(
>          break;
>      }
>  
> +    if ( mask && mask != d->dirty_cpumask )
> +        put_scratch_cpumask();

The right side of the && here makes things feel a little fragile for
me.

> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -159,13 +159,15 @@ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
>  
>      if ( cpu_mask )
>      {
> -        cpumask_t *mask = this_cpu(scratch_cpumask);
> +        cpumask_t *mask;
>  
>          if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
>              return;
>  
> +        mask = get_scratch_cpumask();
>          cpumask_and(mask, cpu_mask, &cpu_online_map);
>          msg->dest32 = cpu_mask_to_apicid(mask);
> +        put_scratch_cpumask();
>      }

This, I think, could do with a little more changing:

    if ( cpu_mask )
    {
        cpumask_t *mask = get_scratch_cpumask();

        cpumask_and(mask, cpu_mask, &cpu_online_map);
        if ( !cpumask_empty(mask) )
            msg->dest32 = cpu_mask_to_apicid(mask);
        put_scratch_cpumask();
    }

This way instead of looking twice at two cpumask_t instances, the
2nd one involves just one. Thoughts?

> --- a/xen/arch/x86/smp.c
> +++ b/xen/arch/x86/smp.c
> @@ -25,6 +25,31 @@
>  #include <irq_vectors.h>
>  #include <mach_apic.h>
>  
> +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
> +
> +#ifndef NDEBUG
> +cpumask_t *scratch_cpumask(bool use)
> +{
> +    static DEFINE_PER_CPU(void *, scratch_cpumask_use);
> +
> +    /*
> +     * Due to reentrancy scratch cpumask cannot be used in IRQ, #MC or #NMI
> +     * context.
> +     */
> +    BUG_ON(in_irq() || in_mc() || in_nmi());
> +
> +    if ( use && unlikely(this_cpu(scratch_cpumask_use)) )
> +    {
> +        printk("%p: scratch CPU mask already in use by %p\n",
> +               __builtin_return_address(0), this_cpu(scratch_cpumask_use));

__builtin_return_address(0) simply shows another time what ...

> +        BUG();

... this already will show. I'd suggest to drop it. Also I think
you want %ps here.

Jan
Roger Pau Monné Feb. 27, 2020, 5:54 p.m. UTC | #2
On Wed, Feb 26, 2020 at 11:36:52AM +0100, Jan Beulich wrote:
> On 24.02.2020 11:46, Roger Pau Monne wrote:
> > Current usage of the per-CPU scratch cpumask is dangerous since
> > there's no way to figure out if the mask is already being used except
> > for manual code inspection of all the callers and possible call paths.
> > 
> > This is unsafe and not reliable, so introduce a minimal get/put
> > infrastructure to prevent nested usage of the scratch mask and usage
> > in interrupt context.
> 
> While I can see the reasoning (especially in light of the change
> which did violate the assumption), I'm still uncertain if this isn't
> "over-engineering". Andrew, do you have a clear opinion one way or
> the other here?
> 
> > Move the declaration of scratch_cpumask to smp.c in order to place the
> > declaration and the accessors as close as possible.
> 
> s/declaration/definition/g

Done.

> > --- a/xen/arch/x86/irq.c
> > +++ b/xen/arch/x86/irq.c
> > @@ -196,7 +196,7 @@ static void _clear_irq_vector(struct irq_desc *desc)
> >  {
> >      unsigned int cpu, old_vector, irq = desc->irq;
> >      unsigned int vector = desc->arch.vector;
> > -    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
> > +    cpumask_t *tmp_mask = get_scratch_cpumask();
> >  
> >      BUG_ON(!valid_irq_vector(vector));
> >  
> > @@ -223,7 +223,10 @@ static void _clear_irq_vector(struct irq_desc *desc)
> >      trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
> >  
> >      if ( likely(!desc->arch.move_in_progress) )
> > +    {
> > +        put_scratch_cpumask();
> >          return;
> > +    }
> 
> I think if possible such error path adjustments would better be
> avoided. And this seems feasible here: There are two entirely
> independent uses of the scratch mask in this function. You could
> therefore put the mask before this point and get it again further
> down, or you could leverage a property of the current
> implementation, plus the fact that the 2nd use doesn't involve
> any "real" function calls, and avoid a 2nd get/put altogether.

No, it's very easy to add function calls later on and forget to use
get_scratch_cpumask.

> Of course another question then is whether it is a good property
> of the current model, i.e. whether it wouldn't be better for
> "put" to actually zap the pointer, to prevent subsequent use.

So that put_scratch_cpumask takes the pointer as a parameter and
writes NULL to it?

> > @@ -2531,12 +2536,12 @@ void fixup_irqs(const cpumask_t *mask, bool verbose)
> >      unsigned int irq;
> >      static int warned;
> >      struct irq_desc *desc;
> > +    cpumask_t *affinity = get_scratch_cpumask();
> >  
> >      for ( irq = 0; irq < nr_irqs; irq++ )
> >      {
> >          bool break_affinity = false, set_affinity = true;
> >          unsigned int vector;
> > -        cpumask_t *affinity = this_cpu(scratch_cpumask);
> >  
> >          if ( irq == 2 )
> >              continue;
> > @@ -2640,6 +2645,8 @@ void fixup_irqs(const cpumask_t *mask, bool verbose)
> >                     irq, CPUMASK_PR(affinity));
> >      }
> >  
> > +    put_scratch_cpumask();
> 
> Just as a remark, not necessarily as a request to change the code: I
> wonder if down the road this pretty wide scope of "holding" the mask
> isn't going to bite us, when a function called from here (in a range
> of code not actively needing the mask) also may want to use the mask.
> But of course we can make this finer grained at the point where it
> might actually start mattering.

We can always reduce the scope if there's a need for it; until then I
would rather leave this as-is.

> 
> > @@ -3645,12 +3647,17 @@ long do_mmuext_op(
> >                                     mask)) )
> >                  rc = -EINVAL;
> >              if ( unlikely(rc) )
> > +            {
> > +                put_scratch_cpumask();
> >                  break;
> > +            }
> 
> Again, instead of adjusting an error path, how about making this
> have an empty statement (i.e. dropping the break) and ...
> 
> >              if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
> 
> ... having this become "else if()"?
> 
> > @@ -4384,6 +4393,9 @@ static int __do_update_va_mapping(
> >          break;
> >      }
> >  
> > +    if ( mask && mask != d->dirty_cpumask )
> > +        put_scratch_cpumask();
> 
> The right side of the && here makes things feel a little fragile for
> me.

What about using:

switch ( flags & ~UVMF_FLUSHTYPE_MASK )
{
case UVMF_LOCAL:
case UVMF_ALL:
    break;

default:
    put_scratch_cpumask();
}

I could also use an if, but I think it's clearer with a switch.

> > --- a/xen/arch/x86/msi.c
> > +++ b/xen/arch/x86/msi.c
> > @@ -159,13 +159,15 @@ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
> >  
> >      if ( cpu_mask )
> >      {
> > -        cpumask_t *mask = this_cpu(scratch_cpumask);
> > +        cpumask_t *mask;
> >  
> >          if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
> >              return;
> >  
> > +        mask = get_scratch_cpumask();
> >          cpumask_and(mask, cpu_mask, &cpu_online_map);
> >          msg->dest32 = cpu_mask_to_apicid(mask);
> > +        put_scratch_cpumask();
> >      }
> 
> This, I think, could do with a little more changing:
> 
>     if ( cpu_mask )
>     {
>         cpumask_t *mask = get_scratch_cpumask();
> 
>         cpumask_and(mask, cpu_mask, &cpu_online_map);
>         if ( !cpumask_empty(mask) )
>             msg->dest32 = cpu_mask_to_apicid(mask);
>         put_scratch_cpumask();
>     }
> 
> This way instead of looking twice at two cpumask_t instances, the
> 2nd one involves just one. Thoughts?

LGTM.

Note however that this won't exit early when the masks don't intersect,
and the address field will then be set from whatever is already in msg->dest32.

> > --- a/xen/arch/x86/smp.c
> > +++ b/xen/arch/x86/smp.c
> > @@ -25,6 +25,31 @@
> >  #include <irq_vectors.h>
> >  #include <mach_apic.h>
> >  
> > +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
> > +
> > +#ifndef NDEBUG
> > +cpumask_t *scratch_cpumask(bool use)
> > +{
> > +    static DEFINE_PER_CPU(void *, scratch_cpumask_use);
> > +
> > +    /*
> > +     * Due to reentrancy scratch cpumask cannot be used in IRQ, #MC or #NMI
> > +     * context.
> > +     */
> > +    BUG_ON(in_irq() || in_mc() || in_nmi());
> > +
> > +    if ( use && unlikely(this_cpu(scratch_cpumask_use)) )
> > +    {
> > +        printk("%p: scratch CPU mask already in use by %p\n",
> > +               __builtin_return_address(0), this_cpu(scratch_cpumask_use));
> 
> __builtin_return_address(0) simply shows another time what ...
> 
> > +        BUG();
> 
> ... this already will show. I'd suggest to drop it. Also I think
> you want %ps here.

Done, thanks.

Roger.
Jan Beulich Feb. 28, 2020, 8:48 a.m. UTC | #3
On 27.02.2020 18:54, Roger Pau Monné wrote:
> On Wed, Feb 26, 2020 at 11:36:52AM +0100, Jan Beulich wrote:
>> On 24.02.2020 11:46, Roger Pau Monne wrote:
>>> --- a/xen/arch/x86/irq.c
>>> +++ b/xen/arch/x86/irq.c
>>> @@ -196,7 +196,7 @@ static void _clear_irq_vector(struct irq_desc *desc)
>>>  {
>>>      unsigned int cpu, old_vector, irq = desc->irq;
>>>      unsigned int vector = desc->arch.vector;
>>> -    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
>>> +    cpumask_t *tmp_mask = get_scratch_cpumask();
>>>  
>>>      BUG_ON(!valid_irq_vector(vector));
>>>  
>>> @@ -223,7 +223,10 @@ static void _clear_irq_vector(struct irq_desc *desc)
>>>      trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
>>>  
>>>      if ( likely(!desc->arch.move_in_progress) )
>>> +    {
>>> +        put_scratch_cpumask();
>>>          return;
>>> +    }
>>
>> I think if possible such error path adjustments would better be
>> avoided. And this seems feasible here: There are two entirely
>> independent uses of the scratch mask in this function. You could
>> therefore put the mask before this point and get it again further
>> down, or you could leverage a property of the current
>> implementation, plus the fact that the 2nd use doesn't involve
>> any "real" function calls, and avoid a 2nd get/put altogether.
> 
> No, it's very easy to add function calls later on and forget to use
> get_scratch_cpumask.

Well, yes, such a deliberate omission would of course require a
bold comment. 
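
Something like (sketch):

    /*
     * Deliberately no second get_scratch_cpumask() here: tmp_mask from the
     * first use above is reused. This is only safe as long as nothing in
     * between can itself use the scratch mask; re-acquire it properly if
     * any function calls get added below.
     */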

>> Of course another question then is whether it is a good property
>> of the current model, i.e. whether it wouldn't be better for
>> "put" to actually zap the pointer, to prevent subsequent use.
> 
> So that put_scratch_cpumask takes the pointer as a parameter and
> writes NULL to it?

For example, yes.
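
One possible shape (purely illustrative; the exact interface would be for a
later version to settle):

#ifndef NDEBUG
#define put_scratch_cpumask(m) do {               \
    BUG_ON((m) != this_cpu(scratch_cpumask));     \
    scratch_cpumask(false);                       \
    (m) = NULL;                                   \
} while ( false )
#else
#define put_scratch_cpumask(m) ((void)((m) = NULL))
#endif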

>>> @@ -3645,12 +3647,17 @@ long do_mmuext_op(
>>>                                     mask)) )
>>>                  rc = -EINVAL;
>>>              if ( unlikely(rc) )
>>> +            {
>>> +                put_scratch_cpumask();
>>>                  break;
>>> +            }
>>
>> Again, instead of adjusting an error path, how about making this
>> have an empty statement (i.e. dropping the break) and ...
>>
>>>              if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
>>
>> ... having this become "else if()"?
>>
>>> @@ -4384,6 +4393,9 @@ static int __do_update_va_mapping(
>>>          break;
>>>      }
>>>  
>>> +    if ( mask && mask != d->dirty_cpumask )
>>> +        put_scratch_cpumask();
>>
>> The right side of the && here makes things feel a little fragile for
>> me.
> 
> What about using:
> 
> switch ( flags & ~UVMF_FLUSHTYPE_MASK )
> {
> case UVMF_LOCAL:
> case UVMF_ALL:
>     break;
> 
> default:
>     put_scratch_cpumask();
> }

Fine with me.

> I could also use an if, but I think it's clearer with a switch.

Agreed.

>>> --- a/xen/arch/x86/msi.c
>>> +++ b/xen/arch/x86/msi.c
>>> @@ -159,13 +159,15 @@ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
>>>  
>>>      if ( cpu_mask )
>>>      {
>>> -        cpumask_t *mask = this_cpu(scratch_cpumask);
>>> +        cpumask_t *mask;
>>>  
>>>          if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
>>>              return;
>>>  
>>> +        mask = get_scratch_cpumask();
>>>          cpumask_and(mask, cpu_mask, &cpu_online_map);
>>>          msg->dest32 = cpu_mask_to_apicid(mask);
>>> +        put_scratch_cpumask();
>>>      }
>>
>> This, I think, could do with a little more changing:
>>
>>     if ( cpu_mask )
>>     {
>>         cpumask_t *mask = get_scratch_cpumask();
>>
>>         cpumask_and(mask, cpu_mask, &cpu_online_map);
>>         if ( !cpumask_empty(mask) )
>>             msg->dest32 = cpu_mask_to_apicid(mask);
>>         put_scratch_cpumask();
>>     }
>>
>> This way instead of looking twice at two cpumask_t instances, the
>> 2nd one involves just one. Thoughts?
> 
> LGTM.
> 
> Note however that this won't exit early when the masks don't intersect,
> and the address field will then be set from whatever is already in msg->dest32.

Oh, I should have noticed this. No, the early exit has to remain
one way or another. I guess I'm fine then with the original
variant.
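
For completeness, one shape that would keep the early exit while still
checking just the single combined mask (purely illustrative):

    if ( cpu_mask )
    {
        cpumask_t *mask = get_scratch_cpumask();

        cpumask_and(mask, cpu_mask, &cpu_online_map);
        if ( cpumask_empty(mask) )
        {
            put_scratch_cpumask();
            return;
        }
        msg->dest32 = cpu_mask_to_apicid(mask);
        put_scratch_cpumask();
    }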

Jan

Patch

diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index e98e08e9c8..4ee261b632 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -2236,10 +2236,11 @@  int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
     entry.vector = vector;
 
     if (cpumask_intersects(desc->arch.cpu_mask, TARGET_CPUS)) {
-        cpumask_t *mask = this_cpu(scratch_cpumask);
+        cpumask_t *mask = get_scratch_cpumask();
 
         cpumask_and(mask, desc->arch.cpu_mask, TARGET_CPUS);
         SET_DEST(entry, logical, cpu_mask_to_apicid(mask));
+        put_scratch_cpumask();
     } else {
         printk(XENLOG_ERR "IRQ%d: no target CPU (%*pb vs %*pb)\n",
                irq, CPUMASK_PR(desc->arch.cpu_mask), CPUMASK_PR(TARGET_CPUS));
@@ -2433,10 +2434,11 @@  int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
 
     if ( cpumask_intersects(desc->arch.cpu_mask, TARGET_CPUS) )
     {
-        cpumask_t *mask = this_cpu(scratch_cpumask);
+        cpumask_t *mask = get_scratch_cpumask();
 
         cpumask_and(mask, desc->arch.cpu_mask, TARGET_CPUS);
         SET_DEST(rte, logical, cpu_mask_to_apicid(mask));
+        put_scratch_cpumask();
     }
     else
     {
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index cc2eb8e925..7ecf5376e3 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -196,7 +196,7 @@  static void _clear_irq_vector(struct irq_desc *desc)
 {
     unsigned int cpu, old_vector, irq = desc->irq;
     unsigned int vector = desc->arch.vector;
-    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
+    cpumask_t *tmp_mask = get_scratch_cpumask();
 
     BUG_ON(!valid_irq_vector(vector));
 
@@ -223,7 +223,10 @@  static void _clear_irq_vector(struct irq_desc *desc)
     trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
 
     if ( likely(!desc->arch.move_in_progress) )
+    {
+        put_scratch_cpumask();
         return;
+    }
 
     /* If we were in motion, also clear desc->arch.old_vector */
     old_vector = desc->arch.old_vector;
@@ -236,6 +239,7 @@  static void _clear_irq_vector(struct irq_desc *desc)
         per_cpu(vector_irq, cpu)[old_vector] = ~irq;
     }
 
+    put_scratch_cpumask();
     release_old_vec(desc);
 
     desc->arch.move_in_progress = 0;
@@ -1152,10 +1156,11 @@  static void irq_guest_eoi_timer_fn(void *data)
         break;
 
     case ACKTYPE_EOI:
-        cpu_eoi_map = this_cpu(scratch_cpumask);
+        cpu_eoi_map = get_scratch_cpumask();
         cpumask_copy(cpu_eoi_map, action->cpu_eoi_map);
         spin_unlock_irq(&desc->lock);
         on_selected_cpus(cpu_eoi_map, set_eoi_ready, desc, 0);
+        put_scratch_cpumask();
         return;
     }
 
@@ -2531,12 +2536,12 @@  void fixup_irqs(const cpumask_t *mask, bool verbose)
     unsigned int irq;
     static int warned;
     struct irq_desc *desc;
+    cpumask_t *affinity = get_scratch_cpumask();
 
     for ( irq = 0; irq < nr_irqs; irq++ )
     {
         bool break_affinity = false, set_affinity = true;
         unsigned int vector;
-        cpumask_t *affinity = this_cpu(scratch_cpumask);
 
         if ( irq == 2 )
             continue;
@@ -2640,6 +2645,8 @@  void fixup_irqs(const cpumask_t *mask, bool verbose)
                    irq, CPUMASK_PR(affinity));
     }
 
+    put_scratch_cpumask();
+
     /* That doesn't seem sufficient.  Give it 1ms. */
     local_irq_enable();
     mdelay(1);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 70b87c4830..0320a9ad98 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1262,7 +1262,7 @@  void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
              (l1e_owner == pg_owner) )
         {
             struct vcpu *v;
-            cpumask_t *mask = this_cpu(scratch_cpumask);
+            cpumask_t *mask = get_scratch_cpumask();
 
             cpumask_clear(mask);
 
@@ -1279,6 +1279,7 @@  void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
 
             if ( !cpumask_empty(mask) )
                 flush_tlb_mask(mask);
+            put_scratch_cpumask();
         }
 #endif /* CONFIG_PV_LDT_PAGING */
         put_page(page);
@@ -2903,7 +2904,7 @@  static int _get_page_type(struct page_info *page, unsigned long type,
                  * vital that no other CPUs are left with mappings of a frame
                  * which is about to become writeable to the guest.
                  */
-                cpumask_t *mask = this_cpu(scratch_cpumask);
+                cpumask_t *mask = get_scratch_cpumask();
 
                 BUG_ON(in_irq());
                 cpumask_copy(mask, d->dirty_cpumask);
@@ -2919,6 +2920,7 @@  static int _get_page_type(struct page_info *page, unsigned long type,
                     perfc_incr(need_flush_tlb_flush);
                     flush_tlb_mask(mask);
                 }
+                put_scratch_cpumask();
 
                 /* We lose existing type and validity. */
                 nx &= ~(PGT_type_mask | PGT_validated);
@@ -3635,7 +3637,7 @@  long do_mmuext_op(
         case MMUEXT_TLB_FLUSH_MULTI:
         case MMUEXT_INVLPG_MULTI:
         {
-            cpumask_t *mask = this_cpu(scratch_cpumask);
+            cpumask_t *mask = get_scratch_cpumask();
 
             if ( unlikely(currd != pg_owner) )
                 rc = -EPERM;
@@ -3645,12 +3647,17 @@  long do_mmuext_op(
                                    mask)) )
                 rc = -EINVAL;
             if ( unlikely(rc) )
+            {
+                put_scratch_cpumask();
                 break;
+            }
 
             if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
                 flush_tlb_mask(mask);
             else if ( __addr_ok(op.arg1.linear_addr) )
                 flush_tlb_one_mask(mask, op.arg1.linear_addr);
+            put_scratch_cpumask();
+
             break;
         }
 
@@ -3683,7 +3690,7 @@  long do_mmuext_op(
             else if ( likely(cache_flush_permitted(currd)) )
             {
                 unsigned int cpu;
-                cpumask_t *mask = this_cpu(scratch_cpumask);
+                cpumask_t *mask = get_scratch_cpumask();
 
                 cpumask_clear(mask);
                 for_each_online_cpu(cpu)
@@ -3691,6 +3698,7 @@  long do_mmuext_op(
                                              per_cpu(cpu_sibling_mask, cpu)) )
                         __cpumask_set_cpu(cpu, mask);
                 flush_mask(mask, FLUSH_CACHE);
+                put_scratch_cpumask();
             }
             else
                 rc = -EINVAL;
@@ -4156,12 +4164,13 @@  long do_mmu_update(
          * Force other vCPU-s of the affected guest to pick up L4 entry
          * changes (if any).
          */
-        unsigned int cpu = smp_processor_id();
-        cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
+        cpumask_t *mask = get_scratch_cpumask();
 
-        cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
+        cpumask_andnot(mask, pt_owner->dirty_cpumask,
+                       cpumask_of(smp_processor_id()));
         if ( !cpumask_empty(mask) )
             flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
+        put_scratch_cpumask();
     }
 
     perfc_add(num_page_updates, i);
@@ -4353,7 +4362,7 @@  static int __do_update_va_mapping(
             mask = d->dirty_cpumask;
             break;
         default:
-            mask = this_cpu(scratch_cpumask);
+            mask = get_scratch_cpumask();
             rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
                                                                      void),
                                       mask);
@@ -4373,7 +4382,7 @@  static int __do_update_va_mapping(
             mask = d->dirty_cpumask;
             break;
         default:
-            mask = this_cpu(scratch_cpumask);
+            mask = get_scratch_cpumask();
             rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
                                                                      void),
                                       mask);
@@ -4384,6 +4393,9 @@  static int __do_update_va_mapping(
         break;
     }
 
+    if ( mask && mask != d->dirty_cpumask )
+        put_scratch_cpumask();
+
     return rc;
 }
 
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 161ee60dbe..6624ea20d0 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -159,13 +159,15 @@  void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
 
     if ( cpu_mask )
     {
-        cpumask_t *mask = this_cpu(scratch_cpumask);
+        cpumask_t *mask;
 
         if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
             return;
 
+        mask = get_scratch_cpumask();
         cpumask_and(mask, cpu_mask, &cpu_online_map);
         msg->dest32 = cpu_mask_to_apicid(mask);
+        put_scratch_cpumask();
     }
 
     msg->address_hi = MSI_ADDR_BASE_HI;
diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
index 53e0de2a70..4d9640d135 100644
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -25,6 +25,31 @@ 
 #include <irq_vectors.h>
 #include <mach_apic.h>
 
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
+
+#ifndef NDEBUG
+cpumask_t *scratch_cpumask(bool use)
+{
+    static DEFINE_PER_CPU(void *, scratch_cpumask_use);
+
+    /*
+     * Due to reentrancy scratch cpumask cannot be used in IRQ, #MC or #NMI
+     * context.
+     */
+    BUG_ON(in_irq() || in_mc() || in_nmi());
+
+    if ( use && unlikely(this_cpu(scratch_cpumask_use)) )
+    {
+        printk("%p: scratch CPU mask already in use by %p\n",
+               __builtin_return_address(0), this_cpu(scratch_cpumask_use));
+        BUG();
+    }
+    this_cpu(scratch_cpumask_use) = use ? __builtin_return_address(0) : NULL;
+
+    return use ? this_cpu(scratch_cpumask) : NULL;
+}
+#endif
+
 /* Helper functions to prepare APIC register values. */
 static unsigned int prepare_ICR(unsigned int shortcut, int vector)
 {
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 82e89201b3..a2ac3adb38 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -54,7 +54,6 @@  DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask);
 /* representing HT and core siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
 
-DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
 static cpumask_t scratch_cpu0mask;
 
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, send_ipi_cpumask);
diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
index 92d69a5ea0..40ab6c251d 100644
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -23,6 +23,16 @@  DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
 DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
 
+#ifndef NDEBUG
+/* Not to be called directly, use {get/put}_scratch_cpumask(). */
+cpumask_t *scratch_cpumask(bool use);
+#define get_scratch_cpumask() scratch_cpumask(true)
+#define put_scratch_cpumask() ((void)scratch_cpumask(false))
+#else
+#define get_scratch_cpumask() this_cpu(scratch_cpumask)
+#define put_scratch_cpumask()
+#endif
+
 /*
  * Do we, for platform reasons, need to actually keep CPUs online when we
  * would otherwise prefer them to be off?