
x86/apic/x2apic: Share IRQ vector between cluster members only when no cpumask is specified

Message ID 1501531546-23548-1-git-send-email-boris.ostrovsky@oracle.com (mailing list archive)
State New, archived

Commit Message

Boris Ostrovsky July 31, 2017, 8:05 p.m. UTC
We have a limited number (slightly under NR_DYNAMIC_VECTORS=192) of IRQ
vectors available to each processor. Currently, when x2apic cluster
mode is used (which is the default), each vector is shared among all
processors in the cluster. With many IRQs (as is the case on systems
with multiple SR-IOV cards) and few clusters (e.g. a single socket),
there is a good chance that we will run out of vectors.

This patch tries to decrease vector sharing between processors by
assigning a vector to a single processor if the assignment request (via
__assign_irq_vector()) comes without explicitly specifying which
processors are expected to share the interrupt. This typically happens
at boot time (or possibly on PCI hotplug) when create_irq(NUMA_NO_NODE)
is called. When __assign_irq_vector() is called from
set_desc_affinity(), which provides a sharing mask, vector sharing
continues to be performed as before.

This patch to some extent mirrors Linux commit d872818dbbee
("x86/apic/x2apic: Use multiple cluster members for the irq destination
only with the explicit affinity").

Note that this change still does not guarantee that we never run out of
vectors. For example, on a single-core system we are effectively back
to the single cluster/socket case of the original code.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 xen/arch/x86/acpi/boot.c                     | 2 +-
 xen/arch/x86/apic.c                          | 2 +-
 xen/arch/x86/genapic/delivery.c              | 4 ++--
 xen/arch/x86/genapic/x2apic.c                | 9 +++++++--
 xen/arch/x86/io_apic.c                       | 8 ++++----
 xen/arch/x86/irq.c                           | 4 ++--
 xen/arch/x86/mpparse.c                       | 2 +-
 xen/arch/x86/msi.c                           | 2 +-
 xen/include/asm-x86/genapic.h                | 9 ++++++---
 xen/include/asm-x86/mach-generic/mach_apic.h | 9 +++++----
 10 files changed, 30 insertions(+), 21 deletions(-)
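
For orientation, here is a toy, self-contained illustration of why the
allocation mask matters (this is deliberately not Xen's
__assign_irq_vector(); all names, types and vector bounds below are made
up): a vector can only be handed out if the same slot is free on every
CPU in the mask the genapic hook returns, so a cluster-wide mask makes
the whole cluster draw from a single budget of slightly under 192
dynamic vectors, while a single-CPU mask lets each CPU's slots be used
independently.

    /* Toy model, not Xen code: the chosen vector must be free on every */
    /* CPU in alloc_mask, and gets reserved on all of them.             */
    #include <stdbool.h>

    #define NR_CPUS            8
    #define FIRST_DYN_VECTOR   0x20   /* made-up bounds, illustration only */
    #define LAST_DYN_VECTOR    0xdf
    #define NR_VECTORS         256

    static bool used[NR_CPUS][NR_VECTORS];

    static int find_vector(const bool alloc_mask[NR_CPUS])
    {
        for (int vec = FIRST_DYN_VECTOR; vec <= LAST_DYN_VECTOR; vec++) {
            bool free_everywhere = true;

            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (alloc_mask[cpu] && used[cpu][vec]) {
                    free_everywhere = false;
                    break;
                }

            if (!free_everywhere)
                continue;

            for (int cpu = 0; cpu < NR_CPUS; cpu++)  /* reserve on all mask members */
                if (alloc_mask[cpu])
                    used[cpu][vec] = true;

            return vec;
        }
        return -1;   /* the shared vector space is exhausted */
    }

With the patch, requests that come in without an explicit affinity get a
mask containing just one CPU, so they stop consuming a slot on every
cluster member.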

Comments

Jan Beulich Aug. 7, 2017, 8:18 a.m. UTC | #1
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/31/17 10:03 PM >>>
>We have limited number (slightly under NR_DYNAMIC_VECTORS=192) of IRQ
>vectors that are available to each processor. Currently, when x2apic
>cluster mode is used (which is default), each vector is shared among
>all processors in the cluster. With many IRQs (as is the case on systems
>with multiple SR-IOV cards) and few clusters (e.g. single socket)
>there is a good chance that we will run out of vectors.
>
>This patch tries to decrease vector sharing between processors by
>assigning vector to a single processor if the assignment request (via
>__assign_irq_vector()) comes without explicitly specifying which
>processors are expected to share the interrupt. This typically happens
>during boot time (or possibly PCI hotplug) when create_irq(NUMA_NO_NODE)
>is called. When __assign_irq_vector() is called from
>set_desc_affinity() which provides sharing mask, vector sharing will
>continue to be performed, as before.

Wouldn't it be sufficient for people running into vector shortage due to
sharing to specify "x2apic_phys" on the command line?

>@@ -72,9 +73,13 @@ static void __init clustered_apic_check_x2apic(void)
>{
>}
 >
>-static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu)
>+static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu,
>+    const cpumask_t *cpumask)
>{
>-    return per_cpu(cluster_cpus, cpu);
>+    if ( cpumask != TARGET_CPUS )

Is a pointer comparison (rather than a content one) here really correct in
the general case?

>+        return per_cpu(cluster_cpus, cpu);
>+    else
>+        return cpumask_of(cpu);

Please avoid the "else" in cases like this.

>--- a/xen/include/asm-x86/mach-generic/mach_apic.h
>+++ b/xen/include/asm-x86/mach-generic/mach_apic.h
>@@ -13,10 +13,11 @@
>#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
>#define INT_DEST_MODE (genapic->int_dest_mode)
>#define TARGET_CPUS      (genapic->target_cpus())
>-#define init_apic_ldr (genapic->init_apic_ldr)
>-#define clustered_apic_check (genapic->clustered_apic_check) 
>-#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
>-#define vector_allocation_cpumask(cpu) (genapic->vector_allocation_cpumask(cpu))
>+#define INIT_APIC_LDR (genapic->init_apic_ldr)
>+#define CLUSTERED_APIC_CHECK (genapic->clustered_apic_check) 
>+#define CPU_MASK_TO_APICID (genapic->cpu_mask_to_apicid)
>+#define VECTOR_ALLOCATION_CPUMASK(cpu, mask) \
>+    (genapic->vector_allocation_cpumask(cpu, mask))
 
I don't see the connection of the change in case of all of these to the purpose
of this patch.

Jan
Boris Ostrovsky Aug. 7, 2017, 2:32 p.m. UTC | #2
On 08/07/2017 04:18 AM, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/31/17 10:03 PM >>>
>> We have limited number (slightly under NR_DYNAMIC_VECTORS=192) of IRQ
>> vectors that are available to each processor. Currently, when x2apic
>> cluster mode is used (which is default), each vector is shared among
>> all processors in the cluster. With many IRQs (as is the case on systems
>> with multiple SR-IOV cards) and few clusters (e.g. single socket)
>> there is a good chance that we will run out of vectors.
>>
>> This patch tries to decrease vector sharing between processors by
>> assigning vector to a single processor if the assignment request (via
>> __assign_irq_vector()) comes without explicitly specifying which
>> processors are expected to share the interrupt. This typically happens
>> during boot time (or possibly PCI hotplug) when create_irq(NUMA_NO_NODE)
>> is called. When __assign_irq_vector() is called from
>> set_desc_affinity() which provides sharing mask, vector sharing will
>> continue to be performed, as before.
> Wouldn't it be sufficient for people running into vector shortage due to
> sharing to specify "x2apic_phys" on the command line?

Yes, it would.

The downside is that an installer (e.g. anaconda) would then have to
figure out which systems need to be installed with this specific option.
Even worse, this problem would show up on systems that have some of the
processors taken offline (in the BIOS), so during the initial
installation there would be no reason to switch to physical mode. Or a
card might be added after the installation.

In other words, I think x2apic_phys is more of a workaround in this case.

>
>> @@ -72,9 +73,13 @@ static void __init clustered_apic_check_x2apic(void)
>> {
>> }
>  >
>> -static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu)
>> +static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu,
>> +    const cpumask_t *cpumask)
>> {
>> -    return per_cpu(cluster_cpus, cpu);
>> +    if ( cpumask != TARGET_CPUS )
> Is a pointer comparison (rather than a content one) here really correct in
> the general case?

When the caller uses TARGET_CPUS, that is an indication that it didn't
explicitly care about sharing (as in assign_irq_vector()). A caller
might want to keep sharing on and provide a mask that has the same
contents as TARGET_CPUS but is stored in a different location. This
allows vector_allocation_cpumask_x2apic_cluster() to distinguish it
from the "don't care" case.

>
>> +        return per_cpu(cluster_cpus, cpu);
>> +    else
>> +        return cpumask_of(cpu);
> Please avoid the "else" in cases like this.
>
>> --- a/xen/include/asm-x86/mach-generic/mach_apic.h
>> +++ b/xen/include/asm-x86/mach-generic/mach_apic.h
>> @@ -13,10 +13,11 @@
>> #define INT_DELIVERY_MODE (genapic->int_delivery_mode)
>> #define INT_DEST_MODE (genapic->int_dest_mode)
>> #define TARGET_CPUS      (genapic->target_cpus())
>> -#define init_apic_ldr (genapic->init_apic_ldr)
>> -#define clustered_apic_check (genapic->clustered_apic_check) 
>> -#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
>> -#define vector_allocation_cpumask(cpu) (genapic->vector_allocation_cpumask(cpu))
>> +#define INIT_APIC_LDR (genapic->init_apic_ldr)
>> +#define CLUSTERED_APIC_CHECK (genapic->clustered_apic_check) 
>> +#define CPU_MASK_TO_APICID (genapic->cpu_mask_to_apicid)
>> +#define VECTOR_ALLOCATION_CPUMASK(cpu, mask) \
>> +    (genapic->vector_allocation_cpumask(cpu, mask))
>  
> I don't see the connection of the change in case of all of these to the purpose
> of this patch.

I need to include asm-x86/mach-generic/mach_apic.h in x2apic.c, and
keeping those four as lower-case would mess up the static initializers
of apic_x2apic_phys and apic_x2apic_cluster. Besides, switching them to
upper case makes them consistent with (what I think are) the similar
definitions of the three macros at the top.


-boris
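
To make the pointer-identity point above concrete, here is a tiny,
hypothetical illustration (not Xen code): two masks can have identical
contents yet still be distinguished by address, which is what the
cpumask != TARGET_CPUS comparison relies on.

    /* Hypothetical sketch, not Xen code: address identity vs. content equality. */
    #include <stdbool.h>
    #include <string.h>

    typedef struct { unsigned long bits[4]; } mask_t;

    static mask_t online_map;      /* stand-in for what TARGET_CPUS points at   */
    static mask_t caller_copy;     /* caller-built mask with identical contents */

    static bool same_content(const mask_t *a, const mask_t *b)
    {
        return memcmp(a->bits, b->bits, sizeof(a->bits)) == 0;  /* cpumask_equal()-like */
    }

    static bool caller_did_not_care(const mask_t *m)
    {
        return m == &online_map;   /* address check: true only for the default pointer */
    }

    /* same_content(&caller_copy, &online_map) may be true while           */
    /* caller_did_not_care(&caller_copy) is false -- the patch keys off    */
    /* the latter, so a content-equal but separately stored mask still     */
    /* requests sharing.                                                   */

The flip side, raised in the follow-ups, is that a caller passing the
very pointer TARGET_CPUS expands to (e.g. &cpu_online_map) cannot be
told apart from the default.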
Andrew Cooper Aug. 7, 2017, 2:39 p.m. UTC | #3
On 07/08/17 09:18, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/31/17 10:03 PM >>>
>> We have limited number (slightly under NR_DYNAMIC_VECTORS=192) of IRQ
>> vectors that are available to each processor. Currently, when x2apic
>> cluster mode is used (which is default), each vector is shared among
>> all processors in the cluster. With many IRQs (as is the case on systems
>> with multiple SR-IOV cards) and few clusters (e.g. single socket)
>> there is a good chance that we will run out of vectors.
>>
>> This patch tries to decrease vector sharing between processors by
>> assigning vector to a single processor if the assignment request (via
>> __assign_irq_vector()) comes without explicitly specifying which
>> processors are expected to share the interrupt. This typically happens
>> during boot time (or possibly PCI hotplug) when create_irq(NUMA_NO_NODE)
>> is called. When __assign_irq_vector() is called from
>> set_desc_affinity() which provides sharing mask, vector sharing will
>> continue to be performed, as before.
> Wouldn't it be sufficient for people running into vector shortage due to
> sharing to specify "x2apic_phys" on the command line?

Had XenServer noticed this change in default earlier, I would have
insisted that you make x2apic_phys the default.

On affected systems, the problem manifests as an upgrade after which
the new hypervisor doesn't boot.  Recovering from this requires human
intervention during the server boot.

As it was, the change went unnoticed for several releases upstream, so
I'm not sure if changing the default at this point is a sensible course
of action.

~Andrew
Jan Beulich Aug. 7, 2017, 2:45 p.m. UTC | #4
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 08/07/17 4:32 PM >>>
>On 08/07/2017 04:18 AM, Jan Beulich wrote:
>>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/31/17 10:03 PM >>>
>>> We have limited number (slightly under NR_DYNAMIC_VECTORS=192) of IRQ
>>> vectors that are available to each processor. Currently, when x2apic
>>> cluster mode is used (which is default), each vector is shared among
>>> all processors in the cluster. With many IRQs (as is the case on systems
>>> with multiple SR-IOV cards) and few clusters (e.g. single socket)
>>> there is a good chance that we will run out of vectors.
>>>
>>> This patch tries to decrease vector sharing between processors by
>>> assigning vector to a single processor if the assignment request (via
>>> __assign_irq_vector()) comes without explicitly specifying which
>>> processors are expected to share the interrupt. This typically happens
>>> during boot time (or possibly PCI hotplug) when create_irq(NUMA_NO_NODE)
>>> is called. When __assign_irq_vector() is called from
>>> set_desc_affinity() which provides sharing mask, vector sharing will
>>> continue to be performed, as before.
>> Wouldn't it be sufficient for people running into vector shortage due to
>> sharing to specify "x2apic_phys" on the command line?
>
>Yes, it would.
>
>The downside is that an installer (e.g. anaconda) would then have to
>figure out which systems need to be installed with this specific option.
>Even worse, this problem would show up on systems that have some of the
>processors taken offline (in BIOS) so during initial installation there
>would be no reason to switch to physical. Or a card might be added after
>the installation.
>
>In other words, I think x2apic_phys is more of a workaround in this case.

I'm not convinced; I'll wait to see if others have opinions one way or the other.

>>> @@ -72,9 +73,13 @@ static void __init clustered_apic_check_x2apic(void)
>>> {
>>> }
>>  >
>>> -static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu)
>>> +static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu,
>>> +    const cpumask_t *cpumask)
>>> {
>>> -    return per_cpu(cluster_cpus, cpu);
>>> +    if ( cpumask != TARGET_CPUS )
>> Is a pointer comparison (rather than a content one) here really correct in
>> the general case?
>
>When the caller is using TARGET_CPUS this is an indication that it
>explicitly didn't care about sharing (in assign_irq_vector()). A caller
>might want to have sharing on and provide a mask that has the same
>content as TARGET_CPUS but is stored in a different location. This will
>allow vector_allocation_cpumask_x2apic_cluster() to distinguish it from
>a "don't care" case.

Yes, I can see that intention. But you still wouldn't e.g. distinguish a caller
using TARGET_CPUS() from one passing &cpu_online_map.

>>> --- a/xen/include/asm-x86/mach-generic/mach_apic.h
>>> +++ b/xen/include/asm-x86/mach-generic/mach_apic.h
>>> @@ -13,10 +13,11 @@
>>> #define INT_DELIVERY_MODE (genapic->int_delivery_mode)
>>> #define INT_DEST_MODE (genapic->int_dest_mode)
>>> #define TARGET_CPUS      (genapic->target_cpus())
>>> -#define init_apic_ldr (genapic->init_apic_ldr)
>>> -#define clustered_apic_check (genapic->clustered_apic_check) 
>>> -#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
>>> -#define vector_allocation_cpumask(cpu) (genapic->vector_allocation_cpumask(cpu))
>>> +#define INIT_APIC_LDR (genapic->init_apic_ldr)
>>> +#define CLUSTERED_APIC_CHECK (genapic->clustered_apic_check) 
>>> +#define CPU_MASK_TO_APICID (genapic->cpu_mask_to_apicid)
>>> +#define VECTOR_ALLOCATION_CPUMASK(cpu, mask) \
>>> +    (genapic->vector_allocation_cpumask(cpu, mask))
>>  
>> I don't see the connection of the change in case of all of these to the purpose
>> of this patch.
>
>I need to include asm-x86/mach-generic/mach_apic.h in x2apic.c and

That's solely because of the use of TARGET_CPUS, isn't it? With the above
comment in mind, this could equally well be &cpu_online_map I think, making
explicit what is implicit right now.

>keeping those 4 as lower-case would mess up static initializers of
>apic_x2apic_phys and apic_x2apic_cluster. Besides, switching them to
>upper case would make them consistent with (what I think are) similar
>definitions of the 3 top macros above.

If you really want to go that route, do the renaming in a precursor patch
please.

Jan
Jan Beulich Aug. 7, 2017, 2:52 p.m. UTC | #5
>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 4:39 PM >>>
>On 07/08/17 09:18, Jan Beulich wrote:
>> Wouldn't it be sufficient for people running into vector shortage due to
>> sharing to specify "x2apic_phys" on the command line?
>
>Had XenServer noticed this change in default earlier, I would have
>insisted that you make x2apic_phys the default.

Change in default? About five and a half years ago I did implement cluster
mode properly, but I wasn't able to spot a commit that changed the
default from physical to cluster.

>On affected systems, problem manifests as an upgrade where the new
>hypervisor doesn't boot.  Recovering from this requires human
>intervention during the server boot.

This needs some more explanation: Why would the hypervisor not boot?
Even if you said Dom0 didn't boot, it wouldn't really be clear to me why,
as running out of vectors would likely occur only after critical devices
had their drivers loaded.

Jan
Boris Ostrovsky Aug. 7, 2017, 3:05 p.m. UTC | #6
>>>> @@ -72,9 +73,13 @@ static void __init clustered_apic_check_x2apic(void)
>>>> {
>>>> }
>>>  >
>>>> -static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu)
>>>> +static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu,
>>>> +    const cpumask_t *cpumask)
>>>> {
>>>> -    return per_cpu(cluster_cpus, cpu);
>>>> +    if ( cpumask != TARGET_CPUS )
>>> Is a pointer comparison (rather than a content one) here really correct in
>>> the general case?
>> When the caller is using TARGET_CPUS this is an indication that it
>> explicitly didn't care about sharing (in assign_irq_vector()). A caller
>> might want to have sharing on and provide a mask that has the same
>> content as TARGET_CPUS but is stored in a different location. This will
>> allow vector_allocation_cpumask_x2apic_cluster() to distinguish it from
>> a "don't care" case.
> Yes, I can see that intention. But you still wouldn't e.g. distinguish a caller
> using TARGET_CPUS() from one passing &cpu_online_map.

True.

>
>>>> --- a/xen/include/asm-x86/mach-generic/mach_apic.h
>>>> +++ b/xen/include/asm-x86/mach-generic/mach_apic.h
>>>> @@ -13,10 +13,11 @@
>>>> #define INT_DELIVERY_MODE (genapic->int_delivery_mode)
>>>> #define INT_DEST_MODE (genapic->int_dest_mode)
>>>> #define TARGET_CPUS      (genapic->target_cpus())
>>>> -#define init_apic_ldr (genapic->init_apic_ldr)
>>>> -#define clustered_apic_check (genapic->clustered_apic_check) 
>>>> -#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
>>>> -#define vector_allocation_cpumask(cpu) (genapic->vector_allocation_cpumask(cpu))
>>>> +#define INIT_APIC_LDR (genapic->init_apic_ldr)
>>>> +#define CLUSTERED_APIC_CHECK (genapic->clustered_apic_check) 
>>>> +#define CPU_MASK_TO_APICID (genapic->cpu_mask_to_apicid)
>>>> +#define VECTOR_ALLOCATION_CPUMASK(cpu, mask) \
>>>> +    (genapic->vector_allocation_cpumask(cpu, mask))
>>>  
>>> I don't see the connection of the change in case of all of these to the purpose
>>> of this patch.
>> I need to include asm-x86/mach-generic/mach_apic.h in x2apic.c and
> That's solely because of the use of TARGET_CPUS, isn't it? With the above
> comment in mind, this could equally well be &cpu_online_map I think, making
> explicit what is implicit right now.

Yes, but it won't prevent a caller from explicitly passing
&cpu_online_map either.

How about allowing assign_irq_vector() to pass a NULL mask to
__assign_irq_vector() (I think this is the only place where it matters)?
__assign_irq_vector() can then turn it into TARGET_CPUS and pass NULL on
to vector_allocation_cpumask().

-boris
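
A rough sketch of that NULL-mask idea, in the style of the hunk quoted
earlier (this is only an illustration of the proposal, not the actual
follow-up patch):

    /* Sketch only: NULL, rather than the TARGET_CPUS pointer, becomes the */
    /* unambiguous "caller didn't specify an affinity" marker.             */
    static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(
        int cpu, const cpumask_t *cpumask)
    {
        if ( !cpumask )                     /* no explicit affinity requested */
            return cpumask_of(cpu);         /* confine the vector to one CPU  */

        return per_cpu(cluster_cpus, cpu);  /* explicit mask: share across the cluster */
    }

__assign_irq_vector() would then substitute TARGET_CPUS for its own
search whenever its mask argument is NULL, while forwarding the NULL to
the hook, so no pointer comparison against TARGET_CPUS is needed at all.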
Boris Ostrovsky Aug. 7, 2017, 3:12 p.m. UTC | #7
On 08/07/2017 10:52 AM, Jan Beulich wrote:
>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 4:39 PM >>>
>> On 07/08/17 09:18, Jan Beulich wrote:
>>> Wouldn't it be sufficient for people running into vector shortage due to
>>> sharing to specify "x2apic_phys" on the command line?
>> Had XenServer noticed this change in default earlier, I would have
>> insisted that you make x2apic_phys the default.
> Change in default? About five and a half years ago I did implement cluster
> mode properly, but I wasn't able to spot a commit that changed the
> default from physical to cluster.
>
>> On affected systems, problem manifests as an upgrade where the new
>> hypervisor doesn't boot.  Recovering from this requires human
>> intervention during the server boot.
> This needs some more explanation: Why would the hypervisor not boot?
> Even if you said Dom0 didn't boot, it wouldn't really be clear to me why,
> as running out of vectors would likely occur only after critical devices
> had their drivers loaded.


What if your critical device is a networking card with lots of
functions? In my case I had two cards: the first one was plumbed
properly but the second failed. It so happened that only the first one
was required to boot, but I can see that in another configuration an
init script might require both to be initialized.


-boris
Jan Beulich Aug. 7, 2017, 3:36 p.m. UTC | #8
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 08/07/17 5:06 PM >>>
>>>>> --- a/xen/include/asm-x86/mach-generic/mach_apic.h
>>>>> +++ b/xen/include/asm-x86/mach-generic/mach_apic.h
>>>>> @@ -13,10 +13,11 @@
>>>>> #define INT_DELIVERY_MODE (genapic->int_delivery_mode)
>>>>> #define INT_DEST_MODE (genapic->int_dest_mode)
>>>>> #define TARGET_CPUS      (genapic->target_cpus())
>>>>> -#define init_apic_ldr (genapic->init_apic_ldr)
>>>>> -#define clustered_apic_check (genapic->clustered_apic_check) 
>>>>> -#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
>>>>> -#define vector_allocation_cpumask(cpu) (genapic->vector_allocation_cpumask(cpu))
>>>>> +#define INIT_APIC_LDR (genapic->init_apic_ldr)
>>>>> +#define CLUSTERED_APIC_CHECK (genapic->clustered_apic_check) 
>>>>> +#define CPU_MASK_TO_APICID (genapic->cpu_mask_to_apicid)
>>>>> +#define VECTOR_ALLOCATION_CPUMASK(cpu, mask) \
>>>>> +    (genapic->vector_allocation_cpumask(cpu, mask))
>>>>  
>>>> I don't see the connection of the change in case of all of these to the purpose
>>>> of this patch.
>>> I need to include asm-x86/mach-generic/mach_apic.h in x2apic.c and
>> That's solely because of the use of TARGET_CPUS, isn't it? With the above
>> comment in mind, this could equally well be &cpu_online_map I think, making
>> explicit what is implicit right now.
>
>Yes, but it it won't prevent a caller from explicitly passing
>&cpu_online_map neither.

Right, hence the suggestion to explicitly compare with &cpu_online_map.

>How about allowing assign_irq_vector() pass NULL mask to
>__assign_irq_vector() (I think this is the only place where it matters)?
>__assign_irq_vector() can then turn it into TARGET_CPUS and pass NULL to
>vector_allocation_cpumask().

That ought to be workable afaics.

Jan
Jan Beulich Aug. 7, 2017, 3:38 p.m. UTC | #9
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 08/07/17 5:12 PM >>>
>On 08/07/2017 10:52 AM, Jan Beulich wrote:
>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 4:39 PM >>>
>>> On 07/08/17 09:18, Jan Beulich wrote:
>>>> Wouldn't it be sufficient for people running into vector shortage due to
>>>> sharing to specify "x2apic_phys" on the command line?
>>> Had XenServer noticed this change in default earlier, I would have
>>> insisted that you make x2apic_phys the default.
>> Change in default? About five and a half years ago I did implement cluster
>> mode properly, but I wasn't able to spot a commit that changed the
>> default from physical to cluster.
>>
>>> On affected systems, problem manifests as an upgrade where the new
>>> hypervisor doesn't boot.  Recovering from this requires human
>>> intervention during the server boot.
>> This needs some more explanation: Why would the hypervisor not boot?
>> Even if you said Dom0 didn't boot, it wouldn't really be clear to me why,
>> as running out of vectors would likely occur only after critical devices
>> had their drivers loaded.
>
>
>If your critical device is a networking card with lots of functions? In
>my case I had two cards. The first one was plumbed properly but the
>second failed. It so happened that only the first one was required to
>boot but I can see that in another configuration an init script might
>require both to be initialized.

So a single NIC drove the system out of vectors? That's insane, I would
say, i.e. I'd call this a misconfigured system. But yeah, if we really want
to get such a thing to work despite the insanity ...

Jan
Andrew Cooper Aug. 7, 2017, 3:39 p.m. UTC | #10
On 07/08/17 16:38, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 08/07/17 5:12 PM >>>
>> On 08/07/2017 10:52 AM, Jan Beulich wrote:
>>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 4:39 PM >>>
>>>> On 07/08/17 09:18, Jan Beulich wrote:
>>>>> Wouldn't it be sufficient for people running into vector shortage due to
>>>>> sharing to specify "x2apic_phys" on the command line?
>>>> Had XenServer noticed this change in default earlier, I would have
>>>> insisted that you make x2apic_phys the default.
>>> Change in default? About five and a half years ago I did implement cluster
>>> mode properly, but I wasn't able to spot a commit that changed the
>>> default from physical to cluster.
>>>
>>>> On affected systems, problem manifests as an upgrade where the new
>>>> hypervisor doesn't boot.  Recovering from this requires human
>>>> intervention during the server boot.
>>> This needs some more explanation: Why would the hypervisor not boot?
>>> Even if you said Dom0 didn't boot, it wouldn't really be clear to me why,
>>> as running out of vectors would likely occur only after critical devices
>>> had their drivers loaded.
>>
>> If your critical device is a networking card with lots of functions? In
>> my case I had two cards. The first one was plumbed properly but the
>> second failed. It so happened that only the first one was required to
>> boot but I can see that in another configuration an init script might
>> require both to be initialized.
> So a single NIC drove the system out of vectors? That's insane, I would
> say, i.e. I'd call this a misconfigured system. But yeah, if we really want
> to get such a thing to work despite the insanity ...

No.  That's one single (dual-headed) card with 128 virtual functions per head.

~Andrew
Jan Beulich Aug. 7, 2017, 4:06 p.m. UTC | #11
>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 5:40 PM >>>
>On 07/08/17 16:38, Jan Beulich wrote:
>> So a single NIC drove the system out of vectors? That's insane, I would
>> say, i.e. I'd call this a misconfigured system. But yeah, if we really want
>> to get such a thing to work despite the insanity ...
>
>No.  That's one single (dual headed) card with 128 virtual functions each.

But putting such a card in a single-core system is, well, not very
reasonable. At the very least I'd expect the admin to limit the number
of VFs then, or not load a driver in Dom0 for them (but only for the PF).

Jan
Andrew Cooper Aug. 7, 2017, 4:20 p.m. UTC | #12
On 07/08/17 17:06, Jan Beulich wrote:
>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 5:40 PM >>>
>> On 07/08/17 16:38, Jan Beulich wrote:
>>> So a single NIC drove the system out of vectors? That's insane, I would
>>> say, i.e. I'd call this a misconfigured system. But yeah, if we really want
>>> to get such a thing to work despite the insanity ...
>> No.  That's one single (dual headed) card with 128 virtual functions each.
> But putting such in a single-core system is, well, not very reasonable. At
> the very least I'd expect the admin to limit the number of VFs then, or not
> load a driver in Dom0 for them (but only for the PF).

It's not a single-core system, but cluster mode allocates vectors
across clusters, not cores. This turns even the largest server system
into a single-core system as far as vector availability goes.

There was definitely a change in behaviour between Xen 4.1 and Xen 4.4.
Looking through the ticket, it appears the regression was introduced by
your cluster IPI broadcast optimisation (which IIRC was for the 64-vCPU
Windows issue), but I'm struggling to locate it in Xen's history.

~Andrew
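
As a rough illustration of the arithmetic (assuming, conservatively,
one MSI-X vector per virtual function; real per-VF counts vary): with
cluster-wide allocation a whole socket shares a single budget of
slightly under NR_DYNAMIC_VECTORS = 192 slots, while the dual-headed
card mentioned above wants at least 2 x 128 = 256 vectors -- so that
one card alone overflows the shared budget regardless of how many cores
the socket has. With per-CPU allocation the same 256 vectors spread
across the socket's cores, and even two cores (2 x 192 = 384 slots)
would be enough in principle.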
Boris Ostrovsky Aug. 7, 2017, 4:49 p.m. UTC | #13
On 08/07/2017 12:20 PM, Andrew Cooper wrote:
> On 07/08/17 17:06, Jan Beulich wrote:
>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 5:40 PM >>>
>>> On 07/08/17 16:38, Jan Beulich wrote:
>>>> So a single NIC drove the system out of vectors? That's insane, I would
>>>> say, i.e. I'd call this a misconfigured system. But yeah, if we really want
>>>> to get such a thing to work despite the insanity ...
>>> No.  That's one single (dual headed) card with 128 virtual functions each.
>> But putting such in a single-core system is, well, not very reasonable. At
>> the very least I'd expect the admin to limit the number of VFs then, or not
>> load a driver in Dom0 for them (but only for the PF).
> Its not a single core system, but cluster mode allocates vectors across
> clusters, not cores. This turns even the largest server system into a
> single core system as far as vector availability goes.

In my case we have enough vectors on a fully-populated system (4
sockets), which is how it was installed.

However, we want to be able to reduce the number of processors, and one
such configuration is to limit processors to a single socket (which
translates to a single cluster as far as the APIC is concerned).
Requiring admins to also reduce the number of virtual functions is
probably somewhat complicated to explain/document (especially since this
would depend on the number of cores per socket).

-boris
Andrew Cooper Aug. 7, 2017, 5:19 p.m. UTC | #14
On 07/08/17 17:49, Boris Ostrovsky wrote:
> On 08/07/2017 12:20 PM, Andrew Cooper wrote:
>> On 07/08/17 17:06, Jan Beulich wrote:
>>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 5:40 PM >>>
>>>> On 07/08/17 16:38, Jan Beulich wrote:
>>>>> So a single NIC drove the system out of vectors? That's insane, I would
>>>>> say, i.e. I'd call this a misconfigured system. But yeah, if we really want
>>>>> to get such a thing to work despite the insanity ...
>>>> No.  That's one single (dual headed) card with 128 virtual functions each.
>>> But putting such in a single-core system is, well, not very reasonable. At
>>> the very least I'd expect the admin to limit the number of VFs then, or not
>>> load a driver in Dom0 for them (but only for the PF).
>> Its not a single core system, but cluster mode allocates vectors across
>> clusters, not cores. This turns even the largest server system into a
>> single core system as far as vector availability goes.
> In my case we have enough vectors on a fully-populated system (4
> sockets), which is how it was installed.
>
> However, we want to be able to reduce the number of processors and one
> such configuration is to limit processors to a single socket (which
> translates to a single cluster as far APIC is concerned). Requiring
> admins to also reduce number of virtual functions is probably somewhat
> complicated to explain/document (especially since this would depend on
> number of cores on a socket).

NetScaler's use case is a dual-socket system, but it is already having
to under-utilise the interrupt capability of the hardware to fit a
single interrupt from every device into the vector space available in Xen.

More than once, we've discussed freeing up the legacy PIC range of
vectors, and freeing up the unallocated vectors in 0xfx.

~Andrew
Jan Beulich Aug. 8, 2017, 9:46 a.m. UTC | #15
>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 6:21 PM >>>
>On 07/08/17 17:06, Jan Beulich wrote:
>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 5:40 PM >>>
>>> On 07/08/17 16:38, Jan Beulich wrote:
>>>> So a single NIC drove the system out of vectors? That's insane, I would
>>>> say, i.e. I'd call this a misconfigured system. But yeah, if we really want
>>>> to get such a thing to work despite the insanity ...
>>> No.  That's one single (dual headed) card with 128 virtual functions each.
>> But putting such in a single-core system is, well, not very reasonable. At
>> the very least I'd expect the admin to limit the number of VFs then, or not
>> load a driver in Dom0 for them (but only for the PF).
>
>Its not a single core system, but cluster mode allocates vectors across
>clusters, not cores. This turns even the largest server system into a
>single core system as far as vector availability goes.
>
>There was definitely a change in behaviour between Xen 4.1 and Xen 4.4. 
>Looking through the ticket, it appears the regression was introduced by
>your cluster IPI broadcast optimisation (which iirc was for the 64 vcpu
>windows issue), but I'm struggling to locate it in Xens history.

But that had nothing to do with the number of vectors used/needed.

Jan
Jan Beulich Aug. 8, 2017, 9:48 a.m. UTC | #16
>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 7:19 PM >>>
>More than once, we've discussed freeing up the legacy PIC range of
>vectors, and freeing up the unallocated vectors in 0xfx.

And that's something I would prefer over the change proposed here,
although I realize that the number of vectors to recover isn't that large.

Jan
Boris Ostrovsky Aug. 8, 2017, 1:25 p.m. UTC | #17
On 08/08/2017 05:48 AM, Jan Beulich wrote:
>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 7:19 PM >>>
>> More than once, we've discussed freeing up the legacy PIC range of
>> vectors, and freeing up the unallocated vectors in 0xfx.
> And that's something I would prefer over the change proposed here,
> albeit I realize that the amount of vectors to recover isn't that large.


That would only net us 32 vectors at the most. That probably wouldn't
be sufficient in many cases where we are currently seeing a vector
shortage. Unfortunately I don't have access to my failing system to say
how many vectors I was short, but I believe it was more than that.

(I am also not sure I understand what is meant by "freeing". Where are
they going?)

-boris
Andrew Cooper Aug. 8, 2017, 1:28 p.m. UTC | #18
On 08/08/17 14:25, Boris Ostrovsky wrote:
> On 08/08/2017 05:48 AM, Jan Beulich wrote:
>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 7:19 PM >>>
>>> More than once, we've discussed freeing up the legacy PIC range of
>>> vectors, and freeing up the unallocated vectors in 0xfx.
>> And that's something I would prefer over the change proposed here,
>> albeit I realize that the amount of vectors to recover isn't that large.
> That would only net us 32 vectors at the most. That probably wouldn't be
> sufficient in many cases where we are currently having vector shortage.
> Unfortunately I don't have access to my failing system to say how may
> vectors I was short but I believe it was more than that.

Indeed, which is why I haven't actually undertaken the work.  It doesn't
come close to solving the problem.

> (I am also not sure I understand what is meant by "freeing". Where are
> they going?)

By "freeing", I mean "available for general irq use" rather than being
reserved and unused in the system.

~Andrew
Boris Ostrovsky Aug. 8, 2017, 2:08 p.m. UTC | #19
On 08/08/2017 09:28 AM, Andrew Cooper wrote:
> On 08/08/17 14:25, Boris Ostrovsky wrote:
>> On 08/08/2017 05:48 AM, Jan Beulich wrote:
>>>>>> Andrew Cooper <andrew.cooper3@citrix.com> 08/07/17 7:19 PM >>>
>>>> More than once, we've discussed freeing up the legacy PIC range of
>>>> vectors, and freeing up the unallocated vectors in 0xfx.
>>> And that's something I would prefer over the change proposed here,
>>> albeit I realize that the amount of vectors to recover isn't that large.
>> That would only net us 32 vectors at the most. That probably wouldn't be
>> sufficient in many cases where we are currently having vector shortage.
>> Unfortunately I don't have access to my failing system to say how may
>> vectors I was short but I believe it was more than that.
> Indeed, which is why I haven't actually undertaken the work.  It doesn't
> come close to solving the problem.
>
>> (I am also not sure I understand what is meant by "freeing". Where are
>> they going?)
> By "freeing", I mean "available for general irq use" rather than being
> reserved and unused in the system.
>


OK, so it's then definitely less than 32 vectors that would become
available.

I will send v2 of this patch.

-boris

Patch

diff --git a/xen/arch/x86/acpi/boot.c b/xen/arch/x86/acpi/boot.c
index 8e6c96d..c16b14a 100644
--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -645,7 +645,7 @@  static void __init acpi_process_madt(void)
 				acpi_ioapic = true;
 
 				smp_found_config = true;
-				clustered_apic_check();
+				CLUSTERED_APIC_CHECK();
 			}
 		}
 		if (error == -EINVAL) {
diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c
index 851a6cc..a0e1798 100644
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -544,7 +544,7 @@  void setup_local_APIC(void)
      * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
      * document number 292116).  So here it goes...
      */
-    init_apic_ldr();
+    INIT_APIC_LDR();
 
     /*
      * Set Task Priority to reject any interrupts below FIRST_DYNAMIC_VECTOR.
diff --git a/xen/arch/x86/genapic/delivery.c b/xen/arch/x86/genapic/delivery.c
index ced92a1..d71c01c 100644
--- a/xen/arch/x86/genapic/delivery.c
+++ b/xen/arch/x86/genapic/delivery.c
@@ -30,7 +30,7 @@  void __init clustered_apic_check_flat(void)
 	printk("Enabling APIC mode:  Flat.  Using %d I/O APICs\n", nr_ioapics);
 }
 
-const cpumask_t *vector_allocation_cpumask_flat(int cpu)
+const cpumask_t *vector_allocation_cpumask_flat(int cpu, const cpumask_t *cpumask)
 {
 	return &cpu_online_map;
 } 
@@ -58,7 +58,7 @@  void __init clustered_apic_check_phys(void)
 	printk("Enabling APIC mode:  Phys.  Using %d I/O APICs\n", nr_ioapics);
 }
 
-const cpumask_t *vector_allocation_cpumask_phys(int cpu)
+const cpumask_t *vector_allocation_cpumask_phys(int cpu, const cpumask_t *cpumask)
 {
 	return cpumask_of(cpu);
 }
diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
index 5fffb31..c0b97c9 100644
--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -27,6 +27,7 @@ 
 #include <asm/processor.h>
 #include <xen/smp.h>
 #include <asm/mach-default/mach_mpparse.h>
+#include <asm/mach-generic/mach_apic.h>
 
 static DEFINE_PER_CPU_READ_MOSTLY(u32, cpu_2_logical_apicid);
 static DEFINE_PER_CPU_READ_MOSTLY(cpumask_t *, cluster_cpus);
@@ -72,9 +73,13 @@  static void __init clustered_apic_check_x2apic(void)
 {
 }
 
-static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu)
+static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu,
+    const cpumask_t *cpumask)
 {
-    return per_cpu(cluster_cpus, cpu);
+    if ( cpumask != TARGET_CPUS )
+        return per_cpu(cluster_cpus, cpu);
+    else
+        return cpumask_of(cpu);
 }
 
 static unsigned int cpu_mask_to_apicid_x2apic_cluster(const cpumask_t *cpumask)
diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index 2838f6b..3eefcfc 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -1038,7 +1038,7 @@  static void __init setup_IO_APIC_irqs(void)
                 disable_8259A_irq(irq_to_desc(irq));
 
             desc = irq_to_desc(irq);
-            SET_DEST(entry, logical, cpu_mask_to_apicid(TARGET_CPUS));
+            SET_DEST(entry, logical, CPU_MASK_TO_APICID(TARGET_CPUS));
             spin_lock_irqsave(&ioapic_lock, flags);
             __ioapic_write_entry(apic, pin, 0, entry);
             set_native_irq_info(irq, TARGET_CPUS);
@@ -1070,7 +1070,7 @@  static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
      */
     entry.dest_mode = INT_DEST_MODE;
     entry.mask = 0;					/* unmask IRQ now */
-    SET_DEST(entry, logical, cpu_mask_to_apicid(TARGET_CPUS));
+    SET_DEST(entry, logical, CPU_MASK_TO_APICID(TARGET_CPUS));
     entry.delivery_mode = INT_DELIVERY_MODE;
     entry.polarity = 0;
     entry.trigger = 0;
@@ -2236,7 +2236,7 @@  int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
     /* Don't chance ending up with an empty mask. */
     if (cpumask_intersects(&mask, desc->arch.cpu_mask))
         cpumask_and(&mask, &mask, desc->arch.cpu_mask);
-    SET_DEST(entry, logical, cpu_mask_to_apicid(&mask));
+    SET_DEST(entry, logical, CPU_MASK_TO_APICID(&mask));
 
     apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
 		"(%d-%d -> %#x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
@@ -2423,7 +2423,7 @@  int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val)
     /* Set the vector field to the real vector! */
     rte.vector = desc->arch.vector;
 
-    SET_DEST(rte, logical, cpu_mask_to_apicid(desc->arch.cpu_mask));
+    SET_DEST(rte, logical, CPU_MASK_TO_APICID(desc->arch.cpu_mask));
 
     __ioapic_write_entry(apic, pin, 0, rte);
     
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index 57e6c18..227a549 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -484,7 +484,7 @@  static int __assign_irq_vector(
         if (!cpu_online(cpu))
             continue;
 
-        cpumask_and(&tmp_mask, vector_allocation_cpumask(cpu),
+        cpumask_and(&tmp_mask, VECTOR_ALLOCATION_CPUMASK(cpu, mask),
                     &cpu_online_map);
 
         vector = current_vector;
@@ -748,7 +748,7 @@  unsigned int set_desc_affinity(struct irq_desc *desc, const cpumask_t *mask)
     cpumask_copy(desc->affinity, mask);
     cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
 
-    return cpu_mask_to_apicid(&dest_mask);
+    return CPU_MASK_TO_APICID(&dest_mask);
 }
 
 /* For re-setting irq interrupt affinity for specific irq */
diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
index a1a0738..4c44f4b 100644
--- a/xen/arch/x86/mpparse.c
+++ b/xen/arch/x86/mpparse.c
@@ -396,7 +396,7 @@  static int __init smp_read_mpc(struct mp_config_table *mpc)
 			}
 		}
 	}
-	clustered_apic_check();
+	CLUSTERED_APIC_CHECK();
 	if (!num_processors)
 		printk(KERN_ERR "SMP mptable: no processors registered!\n");
 	return num_processors;
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 77998f4..1f55c98 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -171,7 +171,7 @@  void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
             return;
 
         cpumask_and(mask, cpu_mask, &cpu_online_map);
-        msg->dest32 = cpu_mask_to_apicid(mask);
+        msg->dest32 = CPU_MASK_TO_APICID(mask);
     }
 
     msg->address_hi = MSI_ADDR_BASE_HI;
diff --git a/xen/include/asm-x86/genapic.h b/xen/include/asm-x86/genapic.h
index 5496ab0..799bed8 100644
--- a/xen/include/asm-x86/genapic.h
+++ b/xen/include/asm-x86/genapic.h
@@ -34,7 +34,8 @@  struct genapic {
 	void (*init_apic_ldr)(void);
 	void (*clustered_apic_check)(void);
 	const cpumask_t *(*target_cpus)(void);
-	const cpumask_t *(*vector_allocation_cpumask)(int cpu);
+	const cpumask_t *(*vector_allocation_cpumask)(int cpu,
+                                                      const cpumask_t *mask);
 	unsigned int (*cpu_mask_to_apicid)(const cpumask_t *cpumask);
 	void (*send_IPI_mask)(const cpumask_t *mask, int vector);
     void (*send_IPI_self)(uint8_t vector);
@@ -58,7 +59,8 @@  void init_apic_ldr_flat(void);
 void clustered_apic_check_flat(void);
 unsigned int cpu_mask_to_apicid_flat(const cpumask_t *cpumask);
 void send_IPI_mask_flat(const cpumask_t *mask, int vector);
-const cpumask_t *vector_allocation_cpumask_flat(int cpu);
+const cpumask_t *vector_allocation_cpumask_flat(int cpu,
+                                                const cpumask_t *cpumask);
 #define GENAPIC_FLAT \
 	.int_delivery_mode = dest_LowestPrio, \
 	.int_dest_mode = 1 /* logical delivery */, \
@@ -74,7 +76,8 @@  void init_apic_ldr_phys(void);
 void clustered_apic_check_phys(void);
 unsigned int cpu_mask_to_apicid_phys(const cpumask_t *cpumask);
 void send_IPI_mask_phys(const cpumask_t *mask, int vector);
-const cpumask_t *vector_allocation_cpumask_phys(int cpu);
+const cpumask_t *vector_allocation_cpumask_phys(int cpu,
+                                                const cpumask_t *cpumask);
 #define GENAPIC_PHYS \
 	.int_delivery_mode = dest_Fixed, \
 	.int_dest_mode = 0 /* physical delivery */, \
diff --git a/xen/include/asm-x86/mach-generic/mach_apic.h b/xen/include/asm-x86/mach-generic/mach_apic.h
index 03e9e8a..999ad77 100644
--- a/xen/include/asm-x86/mach-generic/mach_apic.h
+++ b/xen/include/asm-x86/mach-generic/mach_apic.h
@@ -13,10 +13,11 @@ 
 #define INT_DELIVERY_MODE (genapic->int_delivery_mode)
 #define INT_DEST_MODE (genapic->int_dest_mode)
 #define TARGET_CPUS	  (genapic->target_cpus())
-#define init_apic_ldr (genapic->init_apic_ldr)
-#define clustered_apic_check (genapic->clustered_apic_check) 
-#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
-#define vector_allocation_cpumask(cpu) (genapic->vector_allocation_cpumask(cpu))
+#define INIT_APIC_LDR (genapic->init_apic_ldr)
+#define CLUSTERED_APIC_CHECK (genapic->clustered_apic_check) 
+#define CPU_MASK_TO_APICID (genapic->cpu_mask_to_apicid)
+#define VECTOR_ALLOCATION_CPUMASK(cpu, mask) \
+    (genapic->vector_allocation_cpumask(cpu, mask))
 
 static inline void enable_apic_mode(void)
 {