diff mbox series

[v1b,1/9] x86/IRQ: deal with move-in-progress state in fixup_irqs()

Message ID 5CC71ADE020000780022A1B7@prv1-mh.provo.novell.com (mailing list archive)
State New, archived
Headers show
Series [v1b,1/9] x86/IRQ: deal with move-in-progress state in fixup_irqs() | expand

Commit Message

Jan Beulich April 29, 2019, 3:40 p.m. UTC
The flag being set may prevent affinity changes, as these often imply
assignment of a new vector. When there's no possible destination left
for the IRQ, the clearing of the flag needs to happen right from
fixup_irqs().

Additionally _assign_irq_vector() needs to avoid setting the flag when
there's no online CPU left in what gets put into ->arch.old_cpu_mask.
The old vector can be released right away in this case.

Also extend the log message about broken affinity to include the new
affinity as well, allowing to notice issues with affinity changes not
actually having taken place. Swap the if/else-if order there at the
same time to reduce the amount of conditions checked.

At the same time replace two open coded instances of the new helper
function.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Also update vector_irq[] in the code added to fixup_irqs().

Comments

Roger Pau Monné May 3, 2019, 9:19 a.m. UTC | #1
On Mon, Apr 29, 2019 at 09:40:14AM -0600, Jan Beulich wrote:
> The flag being set may prevent affinity changes, as these often imply
> assignment of a new vector. When there's no possible destination left
> for the IRQ, the clearing of the flag needs to happen right from
> fixup_irqs().
> 
> Additionally _assign_irq_vector() needs to avoid setting the flag when
> there's no online CPU left in what gets put into ->arch.old_cpu_mask.
> The old vector can be released right away in this case.
> 
> Also extend the log message about broken affinity to include the new
> affinity as well, allowing to notice issues with affinity changes not
> actually having taken place. Swap the if/else-if order there at the
> same time to reduce the amount of conditions checked.
> 
> At the same time replace two open coded instances of the new helper
> function.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v2: Also update vector_irq[] in the code added to fixup_irqs().
> 
> --- unstable.orig/xen/arch/x86/irq.c	2019-04-29 17:34:16.726542659 +0200
> +++ unstable/xen/arch/x86/irq.c	2019-04-29 15:05:39.000000000 +0200
> @@ -242,6 +242,20 @@ void destroy_irq(unsigned int irq)
>      xfree(action);
>  }
>  
> +static void release_old_vec(struct irq_desc *desc)
> +{
> +    unsigned int vector = desc->arch.old_vector;
> +
> +    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
> +    cpumask_clear(desc->arch.old_cpu_mask);
> +
> +    if ( desc->arch.used_vectors )

Wouldn't it be better to clean the bitmap when vector !=
IRQ_VECTOR_UNASSIGNED?

I haven't checked all the callers, but I don't think it's valid to
call release_old_vec with desc->arch.old_vector ==
IRQ_VECTOR_UNASSIGNED, in which case I would add an ASSERT.

> +    {
> +        ASSERT(test_bit(vector, desc->arch.used_vectors));
> +        clear_bit(vector, desc->arch.used_vectors);
> +    }
> +}
> +
>  static void __clear_irq_vector(int irq)
>  {
>      int cpu, vector, old_vector;
> @@ -285,14 +299,7 @@ static void __clear_irq_vector(int irq)

Kind of unrelated, but I think the check at the top of
__clear_irq_vector should be:

BUG_ON(desc->arch.vector == IRQ_VECTOR_UNASSIGNED);

Rather than the current:

BUG_ON(!desc->arch.vector);

There's a lot of logic that would go extremely wrong if vector is -1.

>          per_cpu(vector_irq, cpu)[old_vector] = ~irq;
>      }
>  
> -    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
> -    cpumask_clear(desc->arch.old_cpu_mask);
> -
> -    if ( desc->arch.used_vectors )
> -    {
> -        ASSERT(test_bit(old_vector, desc->arch.used_vectors));
> -        clear_bit(old_vector, desc->arch.used_vectors);
> -    }
> +    release_old_vec(desc);
>  
>      desc->arch.move_in_progress = 0;

While there it might be nice to convert move_in_progress to a boolean.

>  }
> @@ -517,12 +524,21 @@ next:
>          /* Found one! */
>          current_vector = vector;
>          current_offset = offset;
> -        if (old_vector > 0) {
> -            desc->arch.move_in_progress = 1;
> -            cpumask_copy(desc->arch.old_cpu_mask, desc->arch.cpu_mask);
> +
> +        if ( old_vector > 0 )
> +        {
> +            cpumask_and(desc->arch.old_cpu_mask, desc->arch.cpu_mask,
> +                        &cpu_online_map);
>              desc->arch.old_vector = desc->arch.vector;
> +            if ( !cpumask_empty(desc->arch.old_cpu_mask) )
> +                desc->arch.move_in_progress = 1;
> +            else
> +                /* This can happen while offlining a CPU. */
> +                release_old_vec(desc);
>          }
> +
>          trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, &tmp_mask);
> +
>          for_each_cpu(new_cpu, &tmp_mask)
>              per_cpu(vector_irq, new_cpu)[vector] = irq;
>          desc->arch.vector = vector;
> @@ -691,14 +707,8 @@ void irq_move_cleanup_interrupt(struct c
>  
>          if ( desc->arch.move_cleanup_count == 0 )
>          {
> -            desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
> -            cpumask_clear(desc->arch.old_cpu_mask);
> -
> -            if ( desc->arch.used_vectors )
> -            {
> -                ASSERT(test_bit(vector, desc->arch.used_vectors));
> -                clear_bit(vector, desc->arch.used_vectors);
> -            }
> +            ASSERT(vector == desc->arch.old_vector);
> +            release_old_vec(desc);
>          }
>  unlock:
>          spin_unlock(&desc->lock);
> @@ -2391,6 +2401,33 @@ void fixup_irqs(const cpumask_t *mask, b
>              continue;
>          }
>  
> +        /*
> +         * In order for the affinity adjustment below to be successful, we
> +         * need __assign_irq_vector() to succeed. This in particular means
> +         * clearing desc->arch.move_in_progress if this would otherwise
> +         * prevent the function from succeeding. Since there's no way for the
> +         * flag to get cleared anymore when there's no possible destination
> +         * left (the only possibility then would be the IRQs enabled window
> +         * after this loop), there's then also no race with us doing it here.
> +         *
> +         * Therefore the logic here and there need to remain in sync.
> +         */
> +        if ( desc->arch.move_in_progress &&
> +             !cpumask_intersects(mask, desc->arch.cpu_mask) )
> +        {
> +            unsigned int cpu;
> +
> +            cpumask_and(&affinity, desc->arch.old_cpu_mask, &cpu_online_map);
> +
> +            spin_lock(&vector_lock);
> +            for_each_cpu(cpu, &affinity)
> +                per_cpu(vector_irq, cpu)[desc->arch.old_vector] = ~irq;
> +            spin_unlock(&vector_lock);
> +
> +            release_old_vec(desc);
> +            desc->arch.move_in_progress = 0;
> +        }
> +
>          cpumask_and(&affinity, &affinity, mask);
>          if ( cpumask_empty(&affinity) )
>          {
> @@ -2409,15 +2446,18 @@ void fixup_irqs(const cpumask_t *mask, b
>          if ( desc->handler->enable )
>              desc->handler->enable(desc);
>  
> +        cpumask_copy(&affinity, desc->affinity);
> +
>          spin_unlock(&desc->lock);
>  
>          if ( !verbose )
>              continue;
>  
> -        if ( break_affinity && set_affinity )
> -            printk("Broke affinity for irq %i\n", irq);
> -        else if ( !set_affinity )
> -            printk("Cannot set affinity for irq %i\n", irq);
> +        if ( !set_affinity )
> +            printk("Cannot set affinity for IRQ%u\n", irq);
> +        else if ( break_affinity )
> +            printk("Broke affinity for IRQ%u, new: %*pb\n",
> +                   irq, nr_cpu_ids, &affinity);

I guess it's fine to have those without rate-limiting because
fixup_irqs is only called for admin-triggered actions, so there's no
risk of console flooding.

Thanks, Roger.
Jan Beulich May 3, 2019, 2:10 p.m. UTC | #2
>>> On 03.05.19 at 11:19, <roger.pau@citrix.com> wrote:
> On Mon, Apr 29, 2019 at 09:40:14AM -0600, Jan Beulich wrote:
>> --- unstable.orig/xen/arch/x86/irq.c	
>> +++ unstable/xen/arch/x86/irq.c
>> @@ -242,6 +242,20 @@ void destroy_irq(unsigned int irq)
>>      xfree(action);
>>  }
>>  
>> +static void release_old_vec(struct irq_desc *desc)
>> +{
>> +    unsigned int vector = desc->arch.old_vector;
>> +
>> +    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
>> +    cpumask_clear(desc->arch.old_cpu_mask);
>> +
>> +    if ( desc->arch.used_vectors )
> 
> Wouldn't it be better to clean the bitmap when vector !=
> IRQ_VECTOR_UNASSIGNED?

No code path does / should call into here without the need to
actually release the previous vector.

> I haven't checked all the callers, but I don't think it's valid to
> call release_old_vec with desc->arch.old_vector ==
> IRQ_VECTOR_UNASSIGNED, in which case I would add an ASSERT.

Well, yes, I probably could. However, as much as I'm in
favor of ASSERT()s, I don't think it makes sense to ASSERT()
basically every bit of expected state. In the end there would
otherwise be more ASSERT()s than actual code.

>> +    {
>> +        ASSERT(test_bit(vector, desc->arch.used_vectors));
>> +        clear_bit(vector, desc->arch.used_vectors);
>> +    }
>> +}
>> +
>>  static void __clear_irq_vector(int irq)
>>  {
>>      int cpu, vector, old_vector;
>> @@ -285,14 +299,7 @@ static void __clear_irq_vector(int irq)
> 
> Kind of unrelated, but I think the check at the top of
> __clear_irq_vector should be:
> 
> BUG_ON(desc->arch.vector == IRQ_VECTOR_UNASSIGNED);
> 
> Rather than the current:
> 
> BUG_ON(!desc->arch.vector);
> 
> There's a lot of logic that would go extremely wrong if vector is -1.

Yes indeed. Do you want to send a patch, or should I add
one at the end of this series?

>>          per_cpu(vector_irq, cpu)[old_vector] = ~irq;
>>      }
>>  
>> -    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
>> -    cpumask_clear(desc->arch.old_cpu_mask);
>> -
>> -    if ( desc->arch.used_vectors )
>> -    {
>> -        ASSERT(test_bit(old_vector, desc->arch.used_vectors));
>> -        clear_bit(old_vector, desc->arch.used_vectors);
>> -    }
>> +    release_old_vec(desc);
>>  
>>      desc->arch.move_in_progress = 0;
> 
> While there it might be nice to convert move_in_progress to a boolean.

This would grow the patch quite a bit I think, so I prefer not to.

>> @@ -2409,15 +2446,18 @@ void fixup_irqs(const cpumask_t *mask, b
>>          if ( desc->handler->enable )
>>              desc->handler->enable(desc);
>>  
>> +        cpumask_copy(&affinity, desc->affinity);
>> +
>>          spin_unlock(&desc->lock);
>>  
>>          if ( !verbose )
>>              continue;
>>  
>> -        if ( break_affinity && set_affinity )
>> -            printk("Broke affinity for irq %i\n", irq);
>> -        else if ( !set_affinity )
>> -            printk("Cannot set affinity for irq %i\n", irq);
>> +        if ( !set_affinity )
>> +            printk("Cannot set affinity for IRQ%u\n", irq);
>> +        else if ( break_affinity )
>> +            printk("Broke affinity for IRQ%u, new: %*pb\n",
>> +                   irq, nr_cpu_ids, &affinity);
> 
> I guess it's fine to have those without rate-limiting because
> fixup_irqs is only called for admin-triggered actions, so there's no
> risk of console flooding.

Right, plus I'd rather not hide any of these messages: Them
being there was already a good indication that something
_might_ be going wrong. If we got to the point where we're
fully confident in the code, then we could think about lowering
their log level, or rate limiting them.

Jan
Jan Beulich May 6, 2019, 7:15 a.m. UTC | #3
>>> On 03.05.19 at 16:10, <JBeulich@suse.com> wrote:
>>>> On 03.05.19 at 11:19, <roger.pau@citrix.com> wrote:
>> On Mon, Apr 29, 2019 at 09:40:14AM -0600, Jan Beulich wrote:
>>> --- unstable.orig/xen/arch/x86/irq.c	
>>> +++ unstable/xen/arch/x86/irq.c
>>> @@ -242,6 +242,20 @@ void destroy_irq(unsigned int irq)
>>>      xfree(action);
>>>  }
>>>  
>>> +static void release_old_vec(struct irq_desc *desc)
>>> +{
>>> +    unsigned int vector = desc->arch.old_vector;
>>> +
>>> +    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
>>> +    cpumask_clear(desc->arch.old_cpu_mask);
>>> +
>>> +    if ( desc->arch.used_vectors )
>> 
>> Wouldn't it be better to clean the bitmap when vector !=
>> IRQ_VECTOR_UNASSIGNED?
> 
> No code path does / should call into here without the need to
> actually release the previous vector.
> 
>> I haven't checked all the callers, but I don't think it's valid to
>> call release_old_vec with desc->arch.old_vector ==
>> IRQ_VECTOR_UNASSIGNED, in which case I would add an ASSERT.
> 
> Well, yes, I probably could. However, as much as I'm in
> favor of ASSERT()s, I don't think it makes sense to ASSERT()
> basically every bit of expected state. In the end there would
> otherwise be more ASSERT()s than actual code.

Actually, upon second thought - let me add this, but then in an
even more strict form: Certain very low and very high numbered
vectors are illegal here as well, and we may then be able to use
the same validation helper elsewhere (in particular also for the
check that you've found to be wrong in _clear_irq_vector()).

Jan
Roger Pau Monné May 6, 2019, 2:28 p.m. UTC | #4
On Mon, May 06, 2019 at 01:15:59AM -0600, Jan Beulich wrote:
> >>> On 03.05.19 at 16:10, <JBeulich@suse.com> wrote:
> >>>> On 03.05.19 at 11:19, <roger.pau@citrix.com> wrote:
> >> On Mon, Apr 29, 2019 at 09:40:14AM -0600, Jan Beulich wrote:
> >>> --- unstable.orig/xen/arch/x86/irq.c	
> >>> +++ unstable/xen/arch/x86/irq.c
> >>> @@ -242,6 +242,20 @@ void destroy_irq(unsigned int irq)
> >>>      xfree(action);
> >>>  }
> >>>  
> >>> +static void release_old_vec(struct irq_desc *desc)
> >>> +{
> >>> +    unsigned int vector = desc->arch.old_vector;
> >>> +
> >>> +    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
> >>> +    cpumask_clear(desc->arch.old_cpu_mask);
> >>> +
> >>> +    if ( desc->arch.used_vectors )
> >> 
> >> Wouldn't it be better to clean the bitmap when vector !=
> >> IRQ_VECTOR_UNASSIGNED?
> > 
> > No code path does / should call into here without the need to
> > actually release the previous vector.
> > 
> >> I haven't checked all the callers, but I don't think it's valid to
> >> call release_old_vec with desc->arch.old_vector ==
> >> IRQ_VECTOR_UNASSIGNED, in which case I would add an ASSERT.
> > 
> > Well, yes, I probably could. However, as much as I'm in
> > favor of ASSERT()s, I don't think it makes sense to ASSERT()
> > basically every bit of expected state. In the end there would
> > otherwise be more ASSERT()s than actual code.
> 
> Actually, upon second thought - let me add this, but then in an
> even more strict form: Certain very low and very high numbered
> vectors are illegal here as well, and we may then be able to use
> the same validation helper elsewhere (in particular also for the
> check that you've found to be wrong in _clear_irq_vector()).

Thanks, that LGTM.

Roger.
Jan Beulich May 6, 2019, 3 p.m. UTC | #5
>>> On 06.05.19 at 16:28, <roger.pau@citrix.com> wrote:
> On Mon, May 06, 2019 at 01:15:59AM -0600, Jan Beulich wrote:
>> >>> On 03.05.19 at 16:10, <JBeulich@suse.com> wrote:
>> >>>> On 03.05.19 at 11:19, <roger.pau@citrix.com> wrote:
>> >> On Mon, Apr 29, 2019 at 09:40:14AM -0600, Jan Beulich wrote:
>> >>> --- unstable.orig/xen/arch/x86/irq.c	
>> >>> +++ unstable/xen/arch/x86/irq.c
>> >>> @@ -242,6 +242,20 @@ void destroy_irq(unsigned int irq)
>> >>>      xfree(action);
>> >>>  }
>> >>>  
>> >>> +static void release_old_vec(struct irq_desc *desc)
>> >>> +{
>> >>> +    unsigned int vector = desc->arch.old_vector;
>> >>> +
>> >>> +    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
>> >>> +    cpumask_clear(desc->arch.old_cpu_mask);
>> >>> +
>> >>> +    if ( desc->arch.used_vectors )
>> >> 
>> >> Wouldn't it be better to clean the bitmap when vector !=
>> >> IRQ_VECTOR_UNASSIGNED?
>> > 
>> > No code path does / should call into here without the need to
>> > actually release the previous vector.
>> > 
>> >> I haven't checked all the callers, but I don't think it's valid to
>> >> call release_old_vec with desc->arch.old_vector ==
>> >> IRQ_VECTOR_UNASSIGNED, in which case I would add an ASSERT.
>> > 
>> > Well, yes, I probably could. However, as much as I'm in
>> > favor of ASSERT()s, I don't think it makes sense to ASSERT()
>> > basically every bit of expected state. In the end there would
>> > otherwise be more ASSERT()s than actual code.
>> 
>> Actually, upon second thought - let me add this, but then in an
>> even more strict form: Certain very low and very high numbered
>> vectors are illegal here as well, and we may then be able to use
>> the same validation helper elsewhere (in particular also for the
>> check that you've found to be wrong in _clear_irq_vector()).
> 
> Thanks, that LGTM.

And FTR - it _does_ trigger. I'm still struggling to explain why.
The only place where ->arch.move_in_progress gets set is
in _assign_irq_vector(), and the check I've put there for
debugging purposes doesn't trigger, i.e. the vectors put there
into ->arch.old_vector are valid.

Jan
diff mbox series

Patch

--- unstable.orig/xen/arch/x86/irq.c	2019-04-29 17:34:16.726542659 +0200
+++ unstable/xen/arch/x86/irq.c	2019-04-29 15:05:39.000000000 +0200
@@ -242,6 +242,20 @@  void destroy_irq(unsigned int irq)
     xfree(action);
 }
 
+static void release_old_vec(struct irq_desc *desc)
+{
+    unsigned int vector = desc->arch.old_vector;
+
+    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
+    cpumask_clear(desc->arch.old_cpu_mask);
+
+    if ( desc->arch.used_vectors )
+    {
+        ASSERT(test_bit(vector, desc->arch.used_vectors));
+        clear_bit(vector, desc->arch.used_vectors);
+    }
+}
+
 static void __clear_irq_vector(int irq)
 {
     int cpu, vector, old_vector;
@@ -285,14 +299,7 @@  static void __clear_irq_vector(int irq)
         per_cpu(vector_irq, cpu)[old_vector] = ~irq;
     }
 
-    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
-    cpumask_clear(desc->arch.old_cpu_mask);
-
-    if ( desc->arch.used_vectors )
-    {
-        ASSERT(test_bit(old_vector, desc->arch.used_vectors));
-        clear_bit(old_vector, desc->arch.used_vectors);
-    }
+    release_old_vec(desc);
 
     desc->arch.move_in_progress = 0;
 }
@@ -517,12 +524,21 @@  next:
         /* Found one! */
         current_vector = vector;
         current_offset = offset;
-        if (old_vector > 0) {
-            desc->arch.move_in_progress = 1;
-            cpumask_copy(desc->arch.old_cpu_mask, desc->arch.cpu_mask);
+
+        if ( old_vector > 0 )
+        {
+            cpumask_and(desc->arch.old_cpu_mask, desc->arch.cpu_mask,
+                        &cpu_online_map);
             desc->arch.old_vector = desc->arch.vector;
+            if ( !cpumask_empty(desc->arch.old_cpu_mask) )
+                desc->arch.move_in_progress = 1;
+            else
+                /* This can happen while offlining a CPU. */
+                release_old_vec(desc);
         }
+
         trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, &tmp_mask);
+
         for_each_cpu(new_cpu, &tmp_mask)
             per_cpu(vector_irq, new_cpu)[vector] = irq;
         desc->arch.vector = vector;
@@ -691,14 +707,8 @@  void irq_move_cleanup_interrupt(struct c
 
         if ( desc->arch.move_cleanup_count == 0 )
         {
-            desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
-            cpumask_clear(desc->arch.old_cpu_mask);
-
-            if ( desc->arch.used_vectors )
-            {
-                ASSERT(test_bit(vector, desc->arch.used_vectors));
-                clear_bit(vector, desc->arch.used_vectors);
-            }
+            ASSERT(vector == desc->arch.old_vector);
+            release_old_vec(desc);
         }
 unlock:
         spin_unlock(&desc->lock);
@@ -2391,6 +2401,33 @@  void fixup_irqs(const cpumask_t *mask, b
             continue;
         }
 
+        /*
+         * In order for the affinity adjustment below to be successful, we
+         * need __assign_irq_vector() to succeed. This in particular means
+         * clearing desc->arch.move_in_progress if this would otherwise
+         * prevent the function from succeeding. Since there's no way for the
+         * flag to get cleared anymore when there's no possible destination
+         * left (the only possibility then would be the IRQs enabled window
+         * after this loop), there's then also no race with us doing it here.
+         *
+         * Therefore the logic here and there need to remain in sync.
+         */
+        if ( desc->arch.move_in_progress &&
+             !cpumask_intersects(mask, desc->arch.cpu_mask) )
+        {
+            unsigned int cpu;
+
+            cpumask_and(&affinity, desc->arch.old_cpu_mask, &cpu_online_map);
+
+            spin_lock(&vector_lock);
+            for_each_cpu(cpu, &affinity)
+                per_cpu(vector_irq, cpu)[desc->arch.old_vector] = ~irq;
+            spin_unlock(&vector_lock);
+
+            release_old_vec(desc);
+            desc->arch.move_in_progress = 0;
+        }
+
         cpumask_and(&affinity, &affinity, mask);
         if ( cpumask_empty(&affinity) )
         {
@@ -2409,15 +2446,18 @@  void fixup_irqs(const cpumask_t *mask, b
         if ( desc->handler->enable )
             desc->handler->enable(desc);
 
+        cpumask_copy(&affinity, desc->affinity);
+
         spin_unlock(&desc->lock);
 
         if ( !verbose )
             continue;
 
-        if ( break_affinity && set_affinity )
-            printk("Broke affinity for irq %i\n", irq);
-        else if ( !set_affinity )
-            printk("Cannot set affinity for irq %i\n", irq);
+        if ( !set_affinity )
+            printk("Cannot set affinity for IRQ%u\n", irq);
+        else if ( break_affinity )
+            printk("Broke affinity for IRQ%u, new: %*pb\n",
+                   irq, nr_cpu_ids, &affinity);
     }
 
     /* That doesn't seem sufficient.  Give it 1ms. */