diff mbox series

x86/io-apic: fix directed EOI when using AMd-Vi interrupt remapping

Message ID 20241018080813.45759-1-roger.pau@citrix.com (mailing list archive)
State New
Headers show
Series x86/io-apic: fix directed EOI when using AMd-Vi interrupt remapping | expand

Commit Message

Roger Pau Monné Oct. 18, 2024, 8:08 a.m. UTC
When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
repurposed to contain part of the offset into the remapping table.  Previous to
2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
table would match the vector.  Such logic was mandatory for end of interrupt to
work, since the vector field (even when not containing a vector) is used by the
IO-APIC to find for which pin the EOI must be performed.

Introduce a table to store the EOI handlers when using interrupt remapping, so
that the IO-APIC driver can translate pins into EOI handlers without having to
read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
unconditionally when interrupt remapping is enabled, even if strictly it would
only be required for AMD-Vi.

Reported-by: Willi Junga <xenproject@ymy.be>
Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

Comments

Marek Marczykowski-Górecki Oct. 19, 2024, 3:23 a.m. UTC | #1
On Fri, Oct 18, 2024 at 10:08:13AM +0200, Roger Pau Monne wrote:
> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> repurposed to contain part of the offset into the remapping table.  Previous to
> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> table would match the vector.  Such logic was mandatory for end of interrupt to
> work, since the vector field (even when not containing a vector) is used by the
> IO-APIC to find for which pin the EOI must be performed.
> 
> Introduce a table to store the EOI handlers when using interrupt remapping, so
> that the IO-APIC driver can translate pins into EOI handlers without having to
> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> unconditionally when interrupt remapping is enabled, even if strictly it would
> only be required for AMD-Vi.
> 
> Reported-by: Willi Junga <xenproject@ymy.be>
> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

I can confirm it fixes touchpad issue on Framework 13 AMD,
it works without ioapic_ack=new now, thanks!
Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>

> ---
>  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
> 
> diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
> index e40d2f7dbd75..8856eb29d275 100644
> --- a/xen/arch/x86/io_apic.c
> +++ b/xen/arch/x86/io_apic.c
> @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
>  
>  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
>  
> +/*
> + * Store the EOI handle when using interrupt remapping.
> + *
> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
> + * format repurposes the vector field to store the offset into the Interrupt
> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
> + * longer matches the contents of the RTE vector field.  Add a translation
> + * table so that directed EOI uses the value in the RTE vector field when
> + * interrupt remapping is enabled.
> + *
> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
> + * when using the remapped format, but use the translation table uniformly in
> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
> + */
> +static unsigned int **apic_pin_eoi;
> +
>  static void share_vector_maps(unsigned int src, unsigned int dst)
>  {
>      unsigned int pin;
> @@ -273,6 +289,13 @@ void __ioapic_write_entry(
>      {
>          __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
>          __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> +        /*
> +         * Might be called before apic_pin_eoi is allocated.  Entry will be
> +         * updated once the array is allocated and there's an EOI or write
> +         * against the pin.
> +         */
> +        if ( apic_pin_eoi )
> +            apic_pin_eoi[apic][pin] = e.vector;
>      }
>      else
>          iommu_update_ire_from_apic(apic, pin, e.raw);
> @@ -298,9 +321,17 @@ static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int p
>      /* Prefer the use of the EOI register if available */
>      if ( ioapic_has_eoi_reg(apic) )
>      {
> +        if ( apic_pin_eoi )
> +            vector = apic_pin_eoi[apic][pin];
> +
>          /* If vector is unknown, read it from the IO-APIC */
>          if ( vector == IRQ_VECTOR_UNASSIGNED )
> +        {
>              vector = __ioapic_read_entry(apic, pin, true).vector;
> +            if ( apic_pin_eoi )
> +                /* Update cached value so further EOI don't need to fetch it. */
> +                apic_pin_eoi[apic][pin] = vector;
> +        }
>  
>          *(IO_APIC_BASE(apic)+16) = vector;
>      }
> @@ -1022,7 +1053,23 @@ static void __init setup_IO_APIC_irqs(void)
>  
>      apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
>  
> +    if ( iommu_intremap )
> +    {
> +        apic_pin_eoi = xzalloc_array(typeof(*apic_pin_eoi), nr_ioapics);
> +        BUG_ON(!apic_pin_eoi);
> +    }
> +
>      for (apic = 0; apic < nr_ioapics; apic++) {
> +        if ( iommu_intremap )
> +        {
> +            apic_pin_eoi[apic] = xmalloc_array(typeof(**apic_pin_eoi),
> +                                               nr_ioapic_entries[apic]);
> +            BUG_ON(!apic_pin_eoi[apic]);
> +
> +            for ( pin = 0; pin < nr_ioapic_entries[apic]; pin++ )
> +                apic_pin_eoi[apic][pin] = IRQ_VECTOR_UNASSIGNED;
> +        }
> +
>          for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) {
>              /*
>               * add it to the IO-APIC irq-routing table:
> -- 
> 2.46.0
> 
>
Alejandro Vallejo Oct. 21, 2024, 9:55 a.m. UTC | #2
On Fri Oct 18, 2024 at 9:08 AM BST, Roger Pau Monne wrote:
> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> repurposed to contain part of the offset into the remapping table.  Previous to

For my own education. Is that really a repurpose? Isn't the RTE vector field
itself simply remapped, just like any MSI?

> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> table would match the vector.  Such logic was mandatory for end of interrupt to
> work, since the vector field (even when not containing a vector) is used by the
> IO-APIC to find for which pin the EOI must be performed.
>
> Introduce a table to store the EOI handlers when using interrupt remapping, so

The table seems to store the pre-IR vectors. Is this a matter of nomenclature
or leftover from a previous implementation?

> that the IO-APIC driver can translate pins into EOI handlers without having to
> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> unconditionally when interrupt remapping is enabled, even if strictly it would
> only be required for AMD-Vi.

Given that last statement it might be worth mentioning that the table is
bypassed when IR is off as well.

>
> Reported-by: Willi Junga <xenproject@ymy.be>
> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
>  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
> index e40d2f7dbd75..8856eb29d275 100644
> --- a/xen/arch/x86/io_apic.c
> +++ b/xen/arch/x86/io_apic.c
> @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
>  
>  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
>  
> +/*
> + * Store the EOI handle when using interrupt remapping.

That explains the when, but not the what. This is "a LUT from IOAPIC pin to its
vector field", as far as I can see. 

The order in which it's meant to be indexed would be a good addition here as
well. I had to scroll down to see how it was used to really see what this was.

> + *
> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
> + * format repurposes the vector field to store the offset into the Interrupt
> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
> + * longer matches the contents of the RTE vector field.  Add a translation
> + * table so that directed EOI uses the value in the RTE vector field when

nit: Might be worth mentioning that it's a merely cache and is populated
on-demand from authoritative state in the IOAPIC.

> + * interrupt remapping is enabled.
> + *
> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
> + * when using the remapped format, but use the translation table uniformly in
> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
> + */
> +static unsigned int **apic_pin_eoi;

This should be signed to allow IRQ_VECTOR_UNASSIGNED, I think. Possibly
int16_t, matching arch_irq_desc->vector. This raises doubts about the existing
vectors here typed as unsigned too.

On naming, I'd rather see ioapic rather than apic, but that's a an existing sin
in the whole file. Otherwise, while it's used for EOI ATM, isn't it really just
an ioapic_pin_vector?

> +
>  static void share_vector_maps(unsigned int src, unsigned int dst)
>  {
>      unsigned int pin;
> @@ -273,6 +289,13 @@ void __ioapic_write_entry(
>      {
>          __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
>          __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> +        /*
> +         * Might be called before apic_pin_eoi is allocated.  Entry will be
> +         * updated once the array is allocated and there's an EOI or write
> +         * against the pin.
> +         */
> +        if ( apic_pin_eoi )
> +            apic_pin_eoi[apic][pin] = e.vector;
>      }
>      else
>          iommu_update_ire_from_apic(apic, pin, e.raw);
> @@ -298,9 +321,17 @@ static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int p

Out of curiosity, how could this vector come to be unassigned as a parameter?
The existing code seems to assume that may happen.

>      /* Prefer the use of the EOI register if available */
>      if ( ioapic_has_eoi_reg(apic) )
>      {
> +        if ( apic_pin_eoi )
> +            vector = apic_pin_eoi[apic][pin];
> +
>          /* If vector is unknown, read it from the IO-APIC */
>          if ( vector == IRQ_VECTOR_UNASSIGNED )
> +        {
>              vector = __ioapic_read_entry(apic, pin, true).vector;
> +            if ( apic_pin_eoi )
> +                /* Update cached value so further EOI don't need to fetch it. */
> +                apic_pin_eoi[apic][pin] = vector;
> +        }
>  
>          *(IO_APIC_BASE(apic)+16) = vector;
>      }
> @@ -1022,7 +1053,23 @@ static void __init setup_IO_APIC_irqs(void)
>  
>      apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
>  
> +    if ( iommu_intremap )
> +    {
> +        apic_pin_eoi = xzalloc_array(typeof(*apic_pin_eoi), nr_ioapics);
> +        BUG_ON(!apic_pin_eoi);
> +    }
> +
>      for (apic = 0; apic < nr_ioapics; apic++) {

Was here before, but it might be a good time to reformat this line and the loop
below.

> +        if ( iommu_intremap )
> +        {
> +            apic_pin_eoi[apic] = xmalloc_array(typeof(**apic_pin_eoi),
> +                                               nr_ioapic_entries[apic]);
> +            BUG_ON(!apic_pin_eoi[apic]);
> +
> +            for ( pin = 0; pin < nr_ioapic_entries[apic]; pin++ )
> +                apic_pin_eoi[apic][pin] = IRQ_VECTOR_UNASSIGNED;
> +        }
> +

Rather than doing this, we could have a single allocation for everything, and
store the different bases accounting for the number of pins of each IOAPIC.

    apic_pin_eoi[0] = base;
    for_each_ioapic
        apic_pin_eoi[i+1] = apic_pin_eoi[i] + nr_ioapic_entries[i];

>          for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) {
>              /*
>               * add it to the IO-APIC irq-routing table:

Cheers,
Alejandro
Andrew Cooper Oct. 21, 2024, 10:07 a.m. UTC | #3
On 21/10/2024 10:55 am, Alejandro Vallejo wrote:
> On Fri Oct 18, 2024 at 9:08 AM BST, Roger Pau Monne wrote:
>> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
>> repurposed to contain part of the offset into the remapping table.  Previous to
> For my own education. Is that really a repurpose? Isn't the RTE vector field
> itself simply remapped, just like any MSI?

Yes, it really is repurposed.  The vector field has a different meaning
under IR, and indeed different meanings between Intel and AMD.

The way you're supposed to use interrupt remapping is to set up a static
configuration in the device (inc IO-APIC in this case), and then (only)
edit the IO-RTE in the IOMMU to change destination/vector.

This avoids needing to do CFG/MMIO cycles to move interrupt affinity,
not to mention the corner cases involved with doing so.

~Andrew
Roger Pau Monné Oct. 21, 2024, 10:49 a.m. UTC | #4
On Mon, Oct 21, 2024 at 10:55:54AM +0100, Alejandro Vallejo wrote:
> On Fri Oct 18, 2024 at 9:08 AM BST, Roger Pau Monne wrote:
> > When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> > repurposed to contain part of the offset into the remapping table.  Previous to
> 
> For my own education. Is that really a repurpose? Isn't the RTE vector field
> itself simply remapped, just like any MSI?

Well, the vector field no longer stores a vector, but an offset into
the Interrupt Remapping table.

> > 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> > table would match the vector.  Such logic was mandatory for end of interrupt to
> > work, since the vector field (even when not containing a vector) is used by the
> > IO-APIC to find for which pin the EOI must be performed.
> >
> > Introduce a table to store the EOI handlers when using interrupt remapping, so
> 
> The table seems to store the pre-IR vectors. Is this a matter of nomenclature
> or leftover from a previous implementation?

IR doesn't change the vector, so pre-IR and post-IR vectors are the
same.

However, the table stores the value of 'raw' IO-APIC RTEs, which would
be the RTEs as written by the IOMMU code (post-IR).  See how IOMMU
code calls __ioapic_write_entry() to update the IO-APIC RTEs to use
the remapped format.

> > that the IO-APIC driver can translate pins into EOI handlers without having to
> > read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> > unconditionally when interrupt remapping is enabled, even if strictly it would
> > only be required for AMD-Vi.
> 
> Given that last statement it might be worth mentioning that the table is
> bypassed when IR is off as well.

Sure, that's fine to add.

> >
> > Reported-by: Willi Junga <xenproject@ymy.be>
> > Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> > Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> >  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 47 insertions(+)
> >
> > diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
> > index e40d2f7dbd75..8856eb29d275 100644
> > --- a/xen/arch/x86/io_apic.c
> > +++ b/xen/arch/x86/io_apic.c
> > @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
> >  
> >  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
> >  
> > +/*
> > + * Store the EOI handle when using interrupt remapping.
> 
> That explains the when, but not the what. This is "a LUT from IOAPIC pin to its
> vector field", as far as I can see. 

Well, it's the vector field when not using the remapped format, it's
no longer a vector field when using IR on AMD-Vi.  Hence why I've
named it "EOI handle".

> The order in which it's meant to be indexed would be a good addition here as
> well. I had to scroll down to see how it was used to really see what this was.

It's [apic][pin] matrix.  It's quite common in the IO-APIC code, I didn't
want to make the comment to verbose, but can certainly add to it.

> > + *
> > + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
> > + * format repurposes the vector field to store the offset into the Interrupt
> > + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
> > + * longer matches the contents of the RTE vector field.  Add a translation
> > + * table so that directed EOI uses the value in the RTE vector field when
> 
> nit: Might be worth mentioning that it's a merely cache and is populated
> on-demand from authoritative state in the IOAPIC.
> 
> > + * interrupt remapping is enabled.
> > + *
> > + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
> > + * when using the remapped format, but use the translation table uniformly in
> > + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
> > + */
> > +static unsigned int **apic_pin_eoi;
> 
> This should be signed to allow IRQ_VECTOR_UNASSIGNED, I think. Possibly
> int16_t, matching arch_irq_desc->vector. This raises doubts about the existing
> vectors here typed as unsigned too.

It's -1 which will be ~0, certainly out of the scope of the vectors
range.

The coding style in Xen is to not use fixed width integers unless
strictly necessary (iow: when representing register values for
example).  I don't think it's strictly required here to use a
fixed-width type.

> 
> On naming, I'd rather see ioapic rather than apic, but that's a an existing sin
> in the whole file. Otherwise, while it's used for EOI ATM, isn't it really just
> an ioapic_pin_vector?

As said above - using 'vector' when using AMD-Vi RTE remapped format is
not accurate IMO.

> > +
> >  static void share_vector_maps(unsigned int src, unsigned int dst)
> >  {
> >      unsigned int pin;
> > @@ -273,6 +289,13 @@ void __ioapic_write_entry(
> >      {
> >          __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
> >          __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> > +        /*
> > +         * Might be called before apic_pin_eoi is allocated.  Entry will be
> > +         * updated once the array is allocated and there's an EOI or write
> > +         * against the pin.
> > +         */
> > +        if ( apic_pin_eoi )
> > +            apic_pin_eoi[apic][pin] = e.vector;
> >      }
> >      else
> >          iommu_update_ire_from_apic(apic, pin, e.raw);
> > @@ -298,9 +321,17 @@ static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int p
> 
> Out of curiosity, how could this vector come to be unassigned as a parameter?
> The existing code seems to assume that may happen.

I think it's possible that some IO-APIC pins are configured before Xen
is started, in which case Xen would need to deal with them.  I didn't
want to break that assumption anyway, if we want to get rid of this
case it should be a separate change.

> >      /* Prefer the use of the EOI register if available */
> >      if ( ioapic_has_eoi_reg(apic) )
> >      {
> > +        if ( apic_pin_eoi )
> > +            vector = apic_pin_eoi[apic][pin];
> > +
> >          /* If vector is unknown, read it from the IO-APIC */
> >          if ( vector == IRQ_VECTOR_UNASSIGNED )
> > +        {
> >              vector = __ioapic_read_entry(apic, pin, true).vector;
> > +            if ( apic_pin_eoi )
> > +                /* Update cached value so further EOI don't need to fetch it. */
> > +                apic_pin_eoi[apic][pin] = vector;
> > +        }
> >  
> >          *(IO_APIC_BASE(apic)+16) = vector;
> >      }
> > @@ -1022,7 +1053,23 @@ static void __init setup_IO_APIC_irqs(void)
> >  
> >      apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
> >  
> > +    if ( iommu_intremap )
> > +    {
> > +        apic_pin_eoi = xzalloc_array(typeof(*apic_pin_eoi), nr_ioapics);
> > +        BUG_ON(!apic_pin_eoi);
> > +    }
> > +
> >      for (apic = 0; apic < nr_ioapics; apic++) {
> 
> Was here before, but it might be a good time to reformat this line and the loop
> below.
> 
> > +        if ( iommu_intremap )
> > +        {
> > +            apic_pin_eoi[apic] = xmalloc_array(typeof(**apic_pin_eoi),
> > +                                               nr_ioapic_entries[apic]);
> > +            BUG_ON(!apic_pin_eoi[apic]);
> > +
> > +            for ( pin = 0; pin < nr_ioapic_entries[apic]; pin++ )
> > +                apic_pin_eoi[apic][pin] = IRQ_VECTOR_UNASSIGNED;
> > +        }
> > +
> 
> Rather than doing this, we could have a single allocation for everything, and
> store the different bases accounting for the number of pins of each IOAPIC.

Could do, overall it seems to make the logic more complicated than
strictly needed.  The allocation is done exclusively once at boot, and
hence doing a single one or possibly 4 or 5 different ones doesn't
seem worth it.  There are not that many IO-APICs on a system.

Thanks, Roger.
Andrew Cooper Oct. 21, 2024, 11:10 a.m. UTC | #5
On 18/10/2024 9:08 am, Roger Pau Monne wrote:
> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> repurposed to contain part of the offset into the remapping table.  Previous to
> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> table would match the vector.  Such logic was mandatory for end of interrupt to
> work, since the vector field (even when not containing a vector) is used by the
> IO-APIC to find for which pin the EOI must be performed.
>
> Introduce a table to store the EOI handlers when using interrupt remapping, so
> that the IO-APIC driver can translate pins into EOI handlers without having to
> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> unconditionally when interrupt remapping is enabled, even if strictly it would
> only be required for AMD-Vi.
>
> Reported-by: Willi Junga <xenproject@ymy.be>
> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

Yet more fallout from the multi-MSI work.  That really has been a giant
source of bugs.

> ---
>  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
> index e40d2f7dbd75..8856eb29d275 100644
> --- a/xen/arch/x86/io_apic.c
> +++ b/xen/arch/x86/io_apic.c
> @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
>  
>  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
>  
> +/*
> + * Store the EOI handle when using interrupt remapping.
> + *
> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
> + * format repurposes the vector field to store the offset into the Interrupt
> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
> + * longer matches the contents of the RTE vector field.  Add a translation
> + * table so that directed EOI uses the value in the RTE vector field when
> + * interrupt remapping is enabled.
> + *
> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
> + * when using the remapped format, but use the translation table uniformly in
> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
> + */
> +static unsigned int **apic_pin_eoi;

I think we can get away with this being uint8_t rather than unsigned
int, especially as we're allocating memory when not strictly necessary.

The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.

Vector 0xff is strictly SPIV and not allocated for anything else, so can
be reused as a suitable sentinel here.

> +
>  static void share_vector_maps(unsigned int src, unsigned int dst)
>  {
>      unsigned int pin;
> @@ -273,6 +289,13 @@ void __ioapic_write_entry(
>      {
>          __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
>          __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> +        /*
> +         * Might be called before apic_pin_eoi is allocated.  Entry will be
> +         * updated once the array is allocated and there's an EOI or write
> +         * against the pin.
> +         */

Is this for the xAPIC path where we turn on interrupts before the IOMMU ?

> +        if ( apic_pin_eoi )
> +            apic_pin_eoi[apic][pin] = e.vector;
>      }
>      else
>          iommu_update_ire_from_apic(apic, pin, e.raw);
> @@ -298,9 +321,17 @@ static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int p
>      /* Prefer the use of the EOI register if available */
>      if ( ioapic_has_eoi_reg(apic) )
>      {
> +        if ( apic_pin_eoi )
> +            vector = apic_pin_eoi[apic][pin];
> +
>          /* If vector is unknown, read it from the IO-APIC */
>          if ( vector == IRQ_VECTOR_UNASSIGNED )
> +        {
>              vector = __ioapic_read_entry(apic, pin, true).vector;
> +            if ( apic_pin_eoi )
> +                /* Update cached value so further EOI don't need to fetch it. */
> +                apic_pin_eoi[apic][pin] = vector;
> +        }
>  
>          *(IO_APIC_BASE(apic)+16) = vector;
>      }
> @@ -1022,7 +1053,23 @@ static void __init setup_IO_APIC_irqs(void)
>  
>      apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
>  
> +    if ( iommu_intremap )

MISRA requires this to be iommu_intremap != iommu_intremap_off.

But, if this safe on older hardware?  iommu_intremap defaults to on
(full), and is then turned off later on boot for various reasons.

We do all memory allocations in setup_IO_APIC_irqs() so at least we get
to see a consistent view of iommu_intremap.

I suppose there's nothing wrong with having an extra cache of the vector
in the way when not using interrupt remapping, so maybe it's fine?

> +    {
> +        apic_pin_eoi = xzalloc_array(typeof(*apic_pin_eoi), nr_ioapics);
> +        BUG_ON(!apic_pin_eoi);
> +    }
> +
>      for (apic = 0; apic < nr_ioapics; apic++) {
> +        if ( iommu_intremap )
> +        {
> +            apic_pin_eoi[apic] = xmalloc_array(typeof(**apic_pin_eoi),
> +                                               nr_ioapic_entries[apic]);
> +            BUG_ON(!apic_pin_eoi[apic]);
> +
> +            for ( pin = 0; pin < nr_ioapic_entries[apic]; pin++ )
> +                apic_pin_eoi[apic][pin] = IRQ_VECTOR_UNASSIGNED;
> +        }

This logic will be better if you pull nr_ioapic_entries[apic] out into a
loop-local variable.

It should also allow the optimiser to turn the for loop into a memset(),
which it can't now because of possible pointer aliasing with the
induction variable.

But overall, the patch looks broadly ok to me.

~Andrew
David Woodhouse Oct. 21, 2024, 11:32 a.m. UTC | #6
On Mon, 2024-10-21 at 10:55 +0100, Alejandro Vallejo wrote:
> On Fri Oct 18, 2024 at 9:08 AM BST, Roger Pau Monne wrote:
> > When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> > repurposed to contain part of the offset into the remapping table.  Previous to
> 
> For my own education....

Careful what you wish for.

http://david.woodhou.se/more-than-you-ever-wanted-to-know-about-x86-msis.txt
David Woodhouse Oct. 21, 2024, 11:34 a.m. UTC | #7
On Fri, 2024-10-18 at 10:08 +0200, Roger Pau Monne wrote:
> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> repurposed to contain part of the offset into the remapping table.  Previous to
> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> table would match the vector.  Such logic was mandatory for end of interrupt to
> work, since the vector field (even when not containing a vector) is used by the
> IO-APIC to find for which pin the EOI must be performed.
> 
> Introduce a table to store the EOI handlers when using interrupt remapping, so
> that the IO-APIC driver can translate pins into EOI handlers without having to
> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> unconditionally when interrupt remapping is enabled, even if strictly it would
> only be required for AMD-Vi.
> 
> Reported-by: Willi Junga <xenproject@ymy.be>
> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

Hm, couldn't we just have used the pin#?

The AMD IOMMU has per-device IRTE, so you *know* you can just use IRTE
indices 0-23 for the I/O APIC pins.
Andrew Cooper Oct. 21, 2024, 11:38 a.m. UTC | #8
On 21/10/2024 12:10 pm, Andrew Cooper wrote:
> On 18/10/2024 9:08 am, Roger Pau Monne wrote:
>> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
>> repurposed to contain part of the offset into the remapping table.  Previous to
>> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
>> table would match the vector.  Such logic was mandatory for end of interrupt to
>> work, since the vector field (even when not containing a vector) is used by the
>> IO-APIC to find for which pin the EOI must be performed.
>>
>> Introduce a table to store the EOI handlers when using interrupt remapping, so
>> that the IO-APIC driver can translate pins into EOI handlers without having to
>> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
>> unconditionally when interrupt remapping is enabled, even if strictly it would
>> only be required for AMD-Vi.
>>
>> Reported-by: Willi Junga <xenproject@ymy.be>
>> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
>> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Yet more fallout from the multi-MSI work.  That really has been a giant
> source of bugs.
>
>> ---
>>  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 47 insertions(+)
>>
>> diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
>> index e40d2f7dbd75..8856eb29d275 100644
>> --- a/xen/arch/x86/io_apic.c
>> +++ b/xen/arch/x86/io_apic.c
>> @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
>>  
>>  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
>>  
>> +/*
>> + * Store the EOI handle when using interrupt remapping.
>> + *
>> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
>> + * format repurposes the vector field to store the offset into the Interrupt
>> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
>> + * longer matches the contents of the RTE vector field.  Add a translation
>> + * table so that directed EOI uses the value in the RTE vector field when
>> + * interrupt remapping is enabled.
>> + *
>> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
>> + * when using the remapped format, but use the translation table uniformly in
>> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
>> + */
>> +static unsigned int **apic_pin_eoi;
> I think we can get away with this being uint8_t rather than unsigned
> int, especially as we're allocating memory when not strictly necessary.
>
> The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.
>
> Vector 0xff is strictly SPIV and not allocated for anything else, so can
> be reused as a suitable sentinel here.

Actually, vectors 0 thru 0x0f are also strictly invalid, and could be
used as sentinels.  That's probably better than trying to play integer
promotion games between IRQ_VECTOR_UNASSIGNED and uint8_t.

~Andrew
Woodhouse, David Oct. 21, 2024, 11:43 a.m. UTC | #9
On Sat, 2024-10-19 at 05:23 +0200, Marek Marczykowski-Górecki wrote:
> On Fri, Oct 18, 2024 at 10:08:13AM +0200, Roger Pau Monne wrote:
> > When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> > repurposed to contain part of the offset into the remapping table.  Previous to
> > 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> > table would match the vector.  Such logic was mandatory for end of interrupt to
> > work, since the vector field (even when not containing a vector) is used by the
> > IO-APIC to find for which pin the EOI must be performed.
> > 
> > Introduce a table to store the EOI handlers when using interrupt remapping, so
> > that the IO-APIC driver can translate pins into EOI handlers without having to
> > read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> > unconditionally when interrupt remapping is enabled, even if strictly it would
> > only be required for AMD-Vi.
> > 
> > Reported-by: Willi Junga <xenproject@ymy.be>
> > Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> > Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> I can confirm it fixes touchpad issue on Framework 13 AMD,
> it works without ioapic_ack=new now, thanks!
> Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>

Thanks for testing. But... why did this work with the auto-EOI? That
*should* have had exactly the same problem, surely? 

The problem fixed by this patch is that the directed EOI used the
actual vector# and *not* the bits that the I/O APIC *thinks* are the
vector#, which are actually the IRTE index#.

But if you let the CPU do its broadcast EOI then surely *that* is going
to use the actual vector# too, and have precisely the same problem?

If you use the code prior to this patch, *without* ioapic_ack=new (i.e.
the mode that was failing), what happens if you do this:

--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -595,7 +595,7 @@ void setup_local_APIC(void)
     /*
      * Enable directed EOI
      */
-    if ( directed_eoi_enabled )
+    if ( 0 && directed_eoi_enabled )
     {
         value |= APIC_SPIV_DIRECTED_EOI;
         apic_printk(APIC_VERBOSE, "Suppress EOI broadcast on CPU#%d\n",


I'm guessing that 'fixes' it too? In which case, it looks like AMD has
some undocumented hack in between its APIC and I/O APIC to let it
magically auto-EOI the correct pin somehow?

Adding Boris and x86@kernel.org to Cc... do we know anything about such
an AMD hack?
Amazon Development Centre (London) Ltd. Registered in England and Wales with registration number 04543232 with its registered office at 1 Principal Place, Worship Street, London EC2A 2FA, United Kingdom.
David Woodhouse Oct. 21, 2024, 11:49 a.m. UTC | #10
On Mon, 2024-10-21 at 12:38 +0100, Andrew Cooper wrote:
> On 21/10/2024 12:10 pm, Andrew Cooper wrote:
> > On 18/10/2024 9:08 am, Roger Pau Monne wrote:
> > 
> > > 
> > > +/*
> > > + * Store the EOI handle when using interrupt remapping.
> > > + *
> > > + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
> > > + * format repurposes the vector field to store the offset into the Interrupt
> > > + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
> > > + * longer matches the contents of the RTE vector field.  Add a translation
> > > + * table so that directed EOI uses the value in the RTE vector field when
> > > + * interrupt remapping is enabled.
> > > + *
> > > + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
> > > + * when using the remapped format, but use the translation table uniformly in
> > > + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
> > > + */
> > > +static unsigned int **apic_pin_eoi;
> > I think we can get away with this being uint8_t rather than unsigned
> > int, especially as we're allocating memory when not strictly necessary.
> > 
> > The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.
> > 
> > Vector 0xff is strictly SPIV and not allocated for anything else, so can
> > be reused as a suitable sentinel here.
> 
> Actually, vectors 0 thru 0x0f are also strictly invalid, and could be
> used as sentinels.  That's probably better than trying to play integer
> promotion games between IRQ_VECTOR_UNASSIGNED and uint8_t.

I don't quite follow how you need a sentinel value. How could you ever
*not* know it, given that you have to write it to the RTE?

(And you should *also* just use the pin# like Linux does, as I said).
Andrew Cooper Oct. 21, 2024, 11:53 a.m. UTC | #11
On 21/10/2024 12:49 pm, David Woodhouse wrote:
> On Mon, 2024-10-21 at 12:38 +0100, Andrew Cooper wrote:
>> On 21/10/2024 12:10 pm, Andrew Cooper wrote:
>>> On 18/10/2024 9:08 am, Roger Pau Monne wrote:
>>>
>>>> +/*
>>>> + * Store the EOI handle when using interrupt remapping.
>>>> + *
>>>> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
>>>> + * format repurposes the vector field to store the offset into the Interrupt
>>>> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
>>>> + * longer matches the contents of the RTE vector field.  Add a translation
>>>> + * table so that directed EOI uses the value in the RTE vector field when
>>>> + * interrupt remapping is enabled.
>>>> + *
>>>> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
>>>> + * when using the remapped format, but use the translation table uniformly in
>>>> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
>>>> + */
>>>> +static unsigned int **apic_pin_eoi;
>>> I think we can get away with this being uint8_t rather than unsigned
>>> int, especially as we're allocating memory when not strictly necessary.
>>>
>>> The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.
>>>
>>> Vector 0xff is strictly SPIV and not allocated for anything else, so can
>>> be reused as a suitable sentinel here.
>> Actually, vectors 0 thru 0x0f are also strictly invalid, and could be
>> used as sentinels.  That's probably better than trying to play integer
>> promotion games between IRQ_VECTOR_UNASSIGNED and uint8_t.
> I don't quite follow how you need a sentinel value. How could you ever
> *not* know it, given that you have to write it to the RTE?
>
> (And you should *also* just use the pin# like Linux does, as I said).

Because Xen is insane and, for non-x2APIC cases, sets the system up
normally and the turns the IOMMU on late.

This really does need deleting, and everything merging into the early path.

~Andrew
Roger Pau Monné Oct. 21, 2024, 11:57 a.m. UTC | #12
On Mon, Oct 21, 2024 at 12:10:14PM +0100, Andrew Cooper wrote:
> On 18/10/2024 9:08 am, Roger Pau Monne wrote:
> > When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> > repurposed to contain part of the offset into the remapping table.  Previous to
> > 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> > table would match the vector.  Such logic was mandatory for end of interrupt to
> > work, since the vector field (even when not containing a vector) is used by the
> > IO-APIC to find for which pin the EOI must be performed.
> >
> > Introduce a table to store the EOI handlers when using interrupt remapping, so
> > that the IO-APIC driver can translate pins into EOI handlers without having to
> > read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> > unconditionally when interrupt remapping is enabled, even if strictly it would
> > only be required for AMD-Vi.
> >
> > Reported-by: Willi Junga <xenproject@ymy.be>
> > Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> > Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> Yet more fallout from the multi-MSI work.  That really has been a giant
> source of bugs.
> 
> > ---
> >  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 47 insertions(+)
> >
> > diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
> > index e40d2f7dbd75..8856eb29d275 100644
> > --- a/xen/arch/x86/io_apic.c
> > +++ b/xen/arch/x86/io_apic.c
> > @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
> >  
> >  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
> >  
> > +/*
> > + * Store the EOI handle when using interrupt remapping.
> > + *
> > + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
> > + * format repurposes the vector field to store the offset into the Interrupt
> > + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
> > + * longer matches the contents of the RTE vector field.  Add a translation
> > + * table so that directed EOI uses the value in the RTE vector field when
> > + * interrupt remapping is enabled.
> > + *
> > + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
> > + * when using the remapped format, but use the translation table uniformly in
> > + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
> > + */
> > +static unsigned int **apic_pin_eoi;
> 
> I think we can get away with this being uint8_t rather than unsigned
> int, especially as we're allocating memory when not strictly necessary.
> 
> The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.
> 
> Vector 0xff is strictly SPIV and not allocated for anything else, so can
> be reused as a suitable sentinel here.

The coding style explicitly discourages using fixed width types unless
it's strictly necessary, I assume the usage here would be covered by
Xen caching a value of a hardware register field that has a
fixed-width size.

> > +
> >  static void share_vector_maps(unsigned int src, unsigned int dst)
> >  {
> >      unsigned int pin;
> > @@ -273,6 +289,13 @@ void __ioapic_write_entry(
> >      {
> >          __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
> >          __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
> > +        /*
> > +         * Might be called before apic_pin_eoi is allocated.  Entry will be
> > +         * updated once the array is allocated and there's an EOI or write
> > +         * against the pin.
> > +         */
> 
> Is this for the xAPIC path where we turn on interrupts before the IOMMU ?

It's for iommu_setup() -> iommu_hardware_setup() saving and restoring
the IO-APIC entries around enabling of interrupt remapping.  This is
done just ahead of smp_prepare_cpus() which is where
setup_IO_APIC_irqs() gets called.

> > +        if ( apic_pin_eoi )
> > +            apic_pin_eoi[apic][pin] = e.vector;
> >      }
> >      else
> >          iommu_update_ire_from_apic(apic, pin, e.raw);
> > @@ -298,9 +321,17 @@ static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int p
> >      /* Prefer the use of the EOI register if available */
> >      if ( ioapic_has_eoi_reg(apic) )
> >      {
> > +        if ( apic_pin_eoi )
> > +            vector = apic_pin_eoi[apic][pin];
> > +
> >          /* If vector is unknown, read it from the IO-APIC */
> >          if ( vector == IRQ_VECTOR_UNASSIGNED )
> > +        {
> >              vector = __ioapic_read_entry(apic, pin, true).vector;
> > +            if ( apic_pin_eoi )
> > +                /* Update cached value so further EOI don't need to fetch it. */
> > +                apic_pin_eoi[apic][pin] = vector;
> > +        }
> >  
> >          *(IO_APIC_BASE(apic)+16) = vector;
> >      }
> > @@ -1022,7 +1053,23 @@ static void __init setup_IO_APIC_irqs(void)
> >  
> >      apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
> >  
> > +    if ( iommu_intremap )
> 
> MISRA requires this to be iommu_intremap != iommu_intremap_off.
> 
> But, if this safe on older hardware?  iommu_intremap defaults to on
> (full), and is then turned off later on boot for various reasons.

I think it's fine because setup_IO_APIC_irqs() is strictly called
after iommu_setup(), so the value of iommu_intremap by that point
should reflect whether IR is enabled.

> We do all memory allocations in setup_IO_APIC_irqs() so at least we get
> to see a consistent view of iommu_intremap.
> 
> I suppose there's nothing wrong with having an extra cache of the vector
> in the way when not using interrupt remapping, so maybe it's fine?
> 
> > +    {
> > +        apic_pin_eoi = xzalloc_array(typeof(*apic_pin_eoi), nr_ioapics);
> > +        BUG_ON(!apic_pin_eoi);
> > +    }
> > +
> >      for (apic = 0; apic < nr_ioapics; apic++) {
> > +        if ( iommu_intremap )
> > +        {
> > +            apic_pin_eoi[apic] = xmalloc_array(typeof(**apic_pin_eoi),
> > +                                               nr_ioapic_entries[apic]);
> > +            BUG_ON(!apic_pin_eoi[apic]);
> > +
> > +            for ( pin = 0; pin < nr_ioapic_entries[apic]; pin++ )
> > +                apic_pin_eoi[apic][pin] = IRQ_VECTOR_UNASSIGNED;
> > +        }
> 
> This logic will be better if you pull nr_ioapic_entries[apic] out into a
> loop-local variable.
> 
> It should also allow the optimiser to turn the for loop into a memset(),
> which it can't now because of possible pointer aliasing with the
> induction variable.

Oh, OK, can send v2 with that adjusted.

> But overall, the patch looks broadly ok to me.

Thanks, Roger.
David Woodhouse Oct. 21, 2024, 12:02 p.m. UTC | #13
On Mon, 2024-10-21 at 12:53 +0100, Andrew Cooper wrote:
> 
> > I don't quite follow how you need a sentinel value. How could you ever
> > *not* know it, given that you have to write it to the RTE?
> > 
> > (And you should *also* just use the pin# like Linux does, as I said).
> 
> Because Xen is insane and, for non-x2APIC cases, sets the system up
> normally and the turns the IOMMU on late.
> 
> This really does need deleting, and everything merging into the early path.

Don't you still have to mask the interrupts when enabling the IOMMU and
then re-enable them by writing the new values to the RTE once remapping
is turned on? So at any given moment, surely it's still the case that
you know what was written to the RTE?

But OK, i don't really want to know... :)
Alejandro Vallejo Oct. 21, 2024, 12:18 p.m. UTC | #14
On Mon Oct 21, 2024 at 12:32 PM BST, David Woodhouse wrote:
> On Mon, 2024-10-21 at 10:55 +0100, Alejandro Vallejo wrote:
> > On Fri Oct 18, 2024 at 9:08 AM BST, Roger Pau Monne wrote:
> > > When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> > > repurposed to contain part of the offset into the remapping table.  Previous to
> > 
> > For my own education....
>
> Careful what you wish for.
>
> http://david.woodhou.se/more-than-you-ever-wanted-to-know-about-x86-msis.txt

I had seen it before, but then neglected to give it the attentive read it very
much deserves. Let me correct that, thanks.

Cheers,
Alejandro
Andrew Cooper Oct. 21, 2024, 12:33 p.m. UTC | #15
On 21/10/2024 12:57 pm, Roger Pau Monné wrote:
> On Mon, Oct 21, 2024 at 12:10:14PM +0100, Andrew Cooper wrote:
>> On 18/10/2024 9:08 am, Roger Pau Monne wrote:
>>> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
>>> repurposed to contain part of the offset into the remapping table.  Previous to
>>> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
>>> table would match the vector.  Such logic was mandatory for end of interrupt to
>>> work, since the vector field (even when not containing a vector) is used by the
>>> IO-APIC to find for which pin the EOI must be performed.
>>>
>>> Introduce a table to store the EOI handlers when using interrupt remapping, so
>>> that the IO-APIC driver can translate pins into EOI handlers without having to
>>> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
>>> unconditionally when interrupt remapping is enabled, even if strictly it would
>>> only be required for AMD-Vi.
>>>
>>> Reported-by: Willi Junga <xenproject@ymy.be>
>>> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
>>> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>> Yet more fallout from the multi-MSI work.  That really has been a giant
>> source of bugs.
>>
>>> ---
>>>  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
>>>  1 file changed, 47 insertions(+)
>>>
>>> diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
>>> index e40d2f7dbd75..8856eb29d275 100644
>>> --- a/xen/arch/x86/io_apic.c
>>> +++ b/xen/arch/x86/io_apic.c
>>> @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
>>>  
>>>  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
>>>  
>>> +/*
>>> + * Store the EOI handle when using interrupt remapping.
>>> + *
>>> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
>>> + * format repurposes the vector field to store the offset into the Interrupt
>>> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
>>> + * longer matches the contents of the RTE vector field.  Add a translation
>>> + * table so that directed EOI uses the value in the RTE vector field when
>>> + * interrupt remapping is enabled.
>>> + *
>>> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
>>> + * when using the remapped format, but use the translation table uniformly in
>>> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
>>> + */
>>> +static unsigned int **apic_pin_eoi;
>> I think we can get away with this being uint8_t rather than unsigned
>> int, especially as we're allocating memory when not strictly necessary.
>>
>> The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.
>>
>> Vector 0xff is strictly SPIV and not allocated for anything else, so can
>> be reused as a suitable sentinel here.
> The coding style explicitly discourages using fixed width types unless
> it's strictly necessary, I assume the usage here would be covered by
> Xen caching a value of a hardware register field that has a
> fixed-width size.

I'm >< this close to reverting that too.  It's not even self-consistent
as written, nonsense in some cases, and is being used as a whipping
stick to reject otherwise-ok patches, which is pure toxicity in the
community.

Not to mention that this rule is in contradiction to MISRA, and there's
no progress being made in that direction.


All variables should be of an appropriate type.

Sometimes that's a fixed width type, and sometimes it's not.  (And this
is what is impossible to dictate in CODING_STYLE.)

In this case, we're talking about a quantity which is strictly in the
range 0x10-0xfe, plus one sentinel.  There is a 4x difference in memory
allocated between unsigned int and uint8_t.

Given that part of the justification here is "allocate memory
unconditionally to simplify things", then a 4x reduction in allocated
memory is definitely a good thing.
>>> +        if ( apic_pin_eoi )
>>> +            apic_pin_eoi[apic][pin] = e.vector;
>>>      }
>>>      else
>>>          iommu_update_ire_from_apic(apic, pin, e.raw);
>>> @@ -298,9 +321,17 @@ static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int p
>>>      /* Prefer the use of the EOI register if available */
>>>      if ( ioapic_has_eoi_reg(apic) )
>>>      {
>>> +        if ( apic_pin_eoi )
>>> +            vector = apic_pin_eoi[apic][pin];
>>> +
>>>          /* If vector is unknown, read it from the IO-APIC */
>>>          if ( vector == IRQ_VECTOR_UNASSIGNED )
>>> +        {
>>>              vector = __ioapic_read_entry(apic, pin, true).vector;
>>> +            if ( apic_pin_eoi )
>>> +                /* Update cached value so further EOI don't need to fetch it. */
>>> +                apic_pin_eoi[apic][pin] = vector;
>>> +        }
>>>  
>>>          *(IO_APIC_BASE(apic)+16) = vector;
>>>      }
>>> @@ -1022,7 +1053,23 @@ static void __init setup_IO_APIC_irqs(void)
>>>  
>>>      apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
>>>  
>>> +    if ( iommu_intremap )
>> MISRA requires this to be iommu_intremap != iommu_intremap_off.
>>
>> But, if this safe on older hardware?  iommu_intremap defaults to on
>> (full), and is then turned off later on boot for various reasons.
> I think it's fine because setup_IO_APIC_irqs() is strictly called
> after iommu_setup(), so the value of iommu_intremap by that point
> should reflect whether IR is enabled.

Ok.  Can you note this in a comment?

~Andrew
Roger Pau Monné Oct. 21, 2024, 2:03 p.m. UTC | #16
On Mon, Oct 21, 2024 at 12:38:13PM +0100, Andrew Cooper wrote:
> On 21/10/2024 12:10 pm, Andrew Cooper wrote:
> > On 18/10/2024 9:08 am, Roger Pau Monne wrote:
> >> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> >> repurposed to contain part of the offset into the remapping table.  Previous to
> >> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> >> table would match the vector.  Such logic was mandatory for end of interrupt to
> >> work, since the vector field (even when not containing a vector) is used by the
> >> IO-APIC to find for which pin the EOI must be performed.
> >>
> >> Introduce a table to store the EOI handlers when using interrupt remapping, so
> >> that the IO-APIC driver can translate pins into EOI handlers without having to
> >> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> >> unconditionally when interrupt remapping is enabled, even if strictly it would
> >> only be required for AMD-Vi.
> >>
> >> Reported-by: Willi Junga <xenproject@ymy.be>
> >> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> >> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> >> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > Yet more fallout from the multi-MSI work.  That really has been a giant
> > source of bugs.
> >
> >> ---
> >>  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 47 insertions(+)
> >>
> >> diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
> >> index e40d2f7dbd75..8856eb29d275 100644
> >> --- a/xen/arch/x86/io_apic.c
> >> +++ b/xen/arch/x86/io_apic.c
> >> @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
> >>  
> >>  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
> >>  
> >> +/*
> >> + * Store the EOI handle when using interrupt remapping.
> >> + *
> >> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
> >> + * format repurposes the vector field to store the offset into the Interrupt
> >> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
> >> + * longer matches the contents of the RTE vector field.  Add a translation
> >> + * table so that directed EOI uses the value in the RTE vector field when
> >> + * interrupt remapping is enabled.
> >> + *
> >> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
> >> + * when using the remapped format, but use the translation table uniformly in
> >> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
> >> + */
> >> +static unsigned int **apic_pin_eoi;
> > I think we can get away with this being uint8_t rather than unsigned
> > int, especially as we're allocating memory when not strictly necessary.
> >
> > The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.
> >
> > Vector 0xff is strictly SPIV and not allocated for anything else, so can
> > be reused as a suitable sentinel here.
> 
> Actually, vectors 0 thru 0x0f are also strictly invalid, and could be
> used as sentinels.  That's probably better than trying to play integer
> promotion games between IRQ_VECTOR_UNASSIGNED and uint8_t.

Hm, do you mean to change IRQ_VECTOR_UNASSIGNED to be 0 instead of -1?
Those vectors are equally invalid to be used for external interrupts,
and hence should never be in an irq_desc.

Otherwise just using 0 for the EOI table sentinel will make the code a
bit more ugly IMO:

if ( apic_pin_eoi )
    vector = apic_pin_eoi[apic][pin] ?: IRQ_VECTOR_UNASSIGNED;

/* If vector is unknown, read it from the IO-APIC */
if ( vector == IRQ_VECTOR_UNASSIGNED )
{
    vector = __ioapic_read_entry(apic, pin, true).vector;
    if ( apic_pin_eoi )
        /* Update cached value so further EOI don't need to fetch it. */
        apic_pin_eoi[apic][pin] = vector;
}

Thanks, Roger.
Roger Pau Monné Oct. 21, 2024, 2:06 p.m. UTC | #17
On Mon, Oct 21, 2024 at 12:34:37PM +0100, David Woodhouse wrote:
> On Fri, 2024-10-18 at 10:08 +0200, Roger Pau Monne wrote:
> > When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> > repurposed to contain part of the offset into the remapping table.  Previous to
> > 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> > table would match the vector.  Such logic was mandatory for end of interrupt to
> > work, since the vector field (even when not containing a vector) is used by the
> > IO-APIC to find for which pin the EOI must be performed.
> > 
> > Introduce a table to store the EOI handlers when using interrupt remapping, so
> > that the IO-APIC driver can translate pins into EOI handlers without having to
> > read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> > unconditionally when interrupt remapping is enabled, even if strictly it would
> > only be required for AMD-Vi.
> > 
> > Reported-by: Willi Junga <xenproject@ymy.be>
> > Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> > Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> Hm, couldn't we just have used the pin#?

Yes, but that would require a much bigger change that what's currently
presented here, and for backport purposes I think it's better done
this way for fixing this specific bug.

Changing to use pin# as the IR offset is worthwhile, but IMO needs to
be done separated from the bugfix here.

> The AMD IOMMU has per-device IRTE, so you *know* you can just use IRTE
> indices 0-23 for the I/O APIC pins.

Aren't there IO-APICs with more than 24 pins?

Thanks, Roger.
Roger Pau Monné Oct. 21, 2024, 2:25 p.m. UTC | #18
On Mon, Oct 21, 2024 at 01:02:31PM +0100, David Woodhouse wrote:
> On Mon, 2024-10-21 at 12:53 +0100, Andrew Cooper wrote:
> > 
> > > I don't quite follow how you need a sentinel value. How could you ever
> > > *not* know it, given that you have to write it to the RTE?
> > > 
> > > (And you should *also* just use the pin# like Linux does, as I said).
> > 
> > Because Xen is insane and, for non-x2APIC cases, sets the system up
> > normally and the turns the IOMMU on late.
> > 
> > This really does need deleting, and everything merging into the early path.
> 
> Don't you still have to mask the interrupts when enabling the IOMMU and
> then re-enable them by writing the new values to the RTE once remapping
> is turned on? So at any given moment, surely it's still the case that
> you know what was written to the RTE?
> 
> But OK, i don't really want to know... :)

It's possible that __io_apic_eoi() gets called before the EOI handler
array is allocated, as part of clear_IO_APIC_pin() that is done ahead
setup_IO_APIC() (so with apic_pin_eoi == NULL).

Whether Xen can get into __io_apic_eoi() with the EOI handler array
allocated but some entries not initialized, it's not clear to me.
However I prefer to act on the safe side and allow the fallback of
fetching the field from the RTE itself.  This is a swamp I don't want
to drain right now (as I'm busy with other stuff).

Thanks, Roger.
Andrew Cooper Oct. 21, 2024, 2:51 p.m. UTC | #19
On 21/10/2024 3:06 pm, Roger Pau Monné wrote:
> On Mon, Oct 21, 2024 at 12:34:37PM +0100, David Woodhouse wrote:
>> On Fri, 2024-10-18 at 10:08 +0200, Roger Pau Monne wrote:
>>> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
>>> repurposed to contain part of the offset into the remapping table.  Previous to
>>> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
>>> table would match the vector.  Such logic was mandatory for end of interrupt to
>>> work, since the vector field (even when not containing a vector) is used by the
>>> IO-APIC to find for which pin the EOI must be performed.
>>>
>>> Introduce a table to store the EOI handlers when using interrupt remapping, so
>>> that the IO-APIC driver can translate pins into EOI handlers without having to
>>> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
>>> unconditionally when interrupt remapping is enabled, even if strictly it would
>>> only be required for AMD-Vi.
>>>
>>> Reported-by: Willi Junga <xenproject@ymy.be>
>>> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
>>> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>> Hm, couldn't we just have used the pin#?
> Yes, but that would require a much bigger change that what's currently
> presented here, and for backport purposes I think it's better done
> this way for fixing this specific bug.
>
> Changing to use pin# as the IR offset is worthwhile, but IMO needs to
> be done separated from the bugfix here.
>
>> The AMD IOMMU has per-device IRTE, so you *know* you can just use IRTE
>> indices 0-23 for the I/O APIC pins.
> Aren't there IO-APICs with more than 24 pins?

Recent Intel SoCs have a single IO-APIC with 120 pins.

~Andrew
David Woodhouse Oct. 21, 2024, 2:54 p.m. UTC | #20
On Mon, 2024-10-21 at 15:51 +0100, Andrew Cooper wrote:
> On 21/10/2024 3:06 pm, Roger Pau Monné wrote:
> > On Mon, Oct 21, 2024 at 12:34:37PM +0100, David Woodhouse wrote:
> > > On Fri, 2024-10-18 at 10:08 +0200, Roger Pau Monne wrote:
> > > > When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> > > > repurposed to contain part of the offset into the remapping table.  Previous to
> > > > 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> > > > table would match the vector.  Such logic was mandatory for end of interrupt to
> > > > work, since the vector field (even when not containing a vector) is used by the
> > > > IO-APIC to find for which pin the EOI must be performed.
> > > > 
> > > > Introduce a table to store the EOI handlers when using interrupt remapping, so
> > > > that the IO-APIC driver can translate pins into EOI handlers without having to
> > > > read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> > > > unconditionally when interrupt remapping is enabled, even if strictly it would
> > > > only be required for AMD-Vi.
> > > > 
> > > > Reported-by: Willi Junga <xenproject@ymy.be>
> > > > Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> > > > Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > Hm, couldn't we just have used the pin#?
> > Yes, but that would require a much bigger change that what's currently
> > presented here, and for backport purposes I think it's better done
> > this way for fixing this specific bug.
> > 
> > Changing to use pin# as the IR offset is worthwhile, but IMO needs to
> > be done separated from the bugfix here.
> > 
> > > The AMD IOMMU has per-device IRTE, so you *know* you can just use IRTE
> > > indices 0-23 for the I/O APIC pins.
> > Aren't there IO-APICs with more than 24 pins?
> 
> Recent Intel SoCs have a single IO-APIC with 120 pins.

And Xen offers a 32-pin one to guests IIRC. I should have said. 'you
can just use IRTE indices 0-(N-1) for the I/O APIC pins'.

The point is the IRTE is per-device, unless the platform has more than
one I/O APIC with the *same* requester-id.
Roger Pau Monné Oct. 21, 2024, 3 p.m. UTC | #21
On Mon, Oct 21, 2024 at 03:54:35PM +0100, David Woodhouse wrote:
> On Mon, 2024-10-21 at 15:51 +0100, Andrew Cooper wrote:
> > On 21/10/2024 3:06 pm, Roger Pau Monné wrote:
> > > On Mon, Oct 21, 2024 at 12:34:37PM +0100, David Woodhouse wrote:
> > > > On Fri, 2024-10-18 at 10:08 +0200, Roger Pau Monne wrote:
> > > > > When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> > > > > repurposed to contain part of the offset into the remapping table.  Previous to
> > > > > 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> > > > > table would match the vector.  Such logic was mandatory for end of interrupt to
> > > > > work, since the vector field (even when not containing a vector) is used by the
> > > > > IO-APIC to find for which pin the EOI must be performed.
> > > > > 
> > > > > Introduce a table to store the EOI handlers when using interrupt remapping, so
> > > > > that the IO-APIC driver can translate pins into EOI handlers without having to
> > > > > read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> > > > > unconditionally when interrupt remapping is enabled, even if strictly it would
> > > > > only be required for AMD-Vi.
> > > > > 
> > > > > Reported-by: Willi Junga <xenproject@ymy.be>
> > > > > Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> > > > > Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> > > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > > Hm, couldn't we just have used the pin#?
> > > Yes, but that would require a much bigger change that what's currently
> > > presented here, and for backport purposes I think it's better done
> > > this way for fixing this specific bug.
> > > 
> > > Changing to use pin# as the IR offset is worthwhile, but IMO needs to
> > > be done separated from the bugfix here.
> > > 
> > > > The AMD IOMMU has per-device IRTE, so you *know* you can just use IRTE
> > > > indices 0-23 for the I/O APIC pins.
> > > Aren't there IO-APICs with more than 24 pins?
> > 
> > Recent Intel SoCs have a single IO-APIC with 120 pins.
> 
> And Xen offers a 32-pin one to guests IIRC. I should have said. 'you
> can just use IRTE indices 0-(N-1) for the I/O APIC pins'.

Indeed, my comment was about the hardcoding of 24 pins, as I recall
seeing IO-APICs with more pins.

> The point is the IRTE is per-device, unless the platform has more than
> one I/O APIC with the *same* requester-id.

Yup, just wanted to clarify whether there was a reason for you to
mention 0-23 explicitly, didn't mean to be pedantic (but possibly
sounded like that).

Thanks, Roger
Alejandro Vallejo Oct. 21, 2024, 3:03 p.m. UTC | #22
On Mon Oct 21, 2024 at 3:51 PM BST, Andrew Cooper wrote:
> On 21/10/2024 3:06 pm, Roger Pau Monné wrote:
> > On Mon, Oct 21, 2024 at 12:34:37PM +0100, David Woodhouse wrote:
> >> On Fri, 2024-10-18 at 10:08 +0200, Roger Pau Monne wrote:
> >>> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> >>> repurposed to contain part of the offset into the remapping table.  Previous to
> >>> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> >>> table would match the vector.  Such logic was mandatory for end of interrupt to
> >>> work, since the vector field (even when not containing a vector) is used by the
> >>> IO-APIC to find for which pin the EOI must be performed.
> >>>
> >>> Introduce a table to store the EOI handlers when using interrupt remapping, so
> >>> that the IO-APIC driver can translate pins into EOI handlers without having to
> >>> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> >>> unconditionally when interrupt remapping is enabled, even if strictly it would
> >>> only be required for AMD-Vi.
> >>>
> >>> Reported-by: Willi Junga <xenproject@ymy.be>
> >>> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> >>> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> >>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> >> Hm, couldn't we just have used the pin#?
> > Yes, but that would require a much bigger change that what's currently
> > presented here, and for backport purposes I think it's better done
> > this way for fixing this specific bug.
> >
> > Changing to use pin# as the IR offset is worthwhile, but IMO needs to
> > be done separated from the bugfix here.
> >
> >> The AMD IOMMU has per-device IRTE, so you *know* you can just use IRTE
> >> indices 0-23 for the I/O APIC pins.
> > Aren't there IO-APICs with more than 24 pins?
>
> Recent Intel SoCs have a single IO-APIC with 120 pins.
>
> ~Andrew

I can't say I understand why though.

In practice you have the legacy ISA IRQs and the 4 legacy PCI INTx. If you have
a weird enough system you might have more than one PCIe bus, but even that fits
more than nicely in 24 "pins". Does ACPI give more than 4 IRQs these days after
an adequate blood sacrifice to the gods of AML?

Cheers,
Alejandro
Andrew Cooper Oct. 21, 2024, 3:08 p.m. UTC | #23
On 21/10/2024 4:03 pm, Alejandro Vallejo wrote:
> On Mon Oct 21, 2024 at 3:51 PM BST, Andrew Cooper wrote:
>> On 21/10/2024 3:06 pm, Roger Pau Monné wrote:
>>> On Mon, Oct 21, 2024 at 12:34:37PM +0100, David Woodhouse wrote:
>>>> On Fri, 2024-10-18 at 10:08 +0200, Roger Pau Monne wrote:
>>>>> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
>>>>> repurposed to contain part of the offset into the remapping table.  Previous to
>>>>> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
>>>>> table would match the vector.  Such logic was mandatory for end of interrupt to
>>>>> work, since the vector field (even when not containing a vector) is used by the
>>>>> IO-APIC to find for which pin the EOI must be performed.
>>>>>
>>>>> Introduce a table to store the EOI handlers when using interrupt remapping, so
>>>>> that the IO-APIC driver can translate pins into EOI handlers without having to
>>>>> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
>>>>> unconditionally when interrupt remapping is enabled, even if strictly it would
>>>>> only be required for AMD-Vi.
>>>>>
>>>>> Reported-by: Willi Junga <xenproject@ymy.be>
>>>>> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
>>>>> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
>>>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>>>> Hm, couldn't we just have used the pin#?
>>> Yes, but that would require a much bigger change that what's currently
>>> presented here, and for backport purposes I think it's better done
>>> this way for fixing this specific bug.
>>>
>>> Changing to use pin# as the IR offset is worthwhile, but IMO needs to
>>> be done separated from the bugfix here.
>>>
>>>> The AMD IOMMU has per-device IRTE, so you *know* you can just use IRTE
>>>> indices 0-23 for the I/O APIC pins.
>>> Aren't there IO-APICs with more than 24 pins?
>> Recent Intel SoCs have a single IO-APIC with 120 pins.
>>
>> ~Andrew
> I can't say I understand why though.
>
> In practice you have the legacy ISA IRQs and the 4 legacy PCI INTx. If you have
> a weird enough system you might have more than one PCIe bus, but even that fits
> more than nicely in 24 "pins". Does ACPI give more than 4 IRQs these days after
> an adequate blood sacrifice to the gods of AML?

There's a whole bunch of pins for misc non-PCI(e) things, including
plain GPIO lines.

~Andrew
Roger Pau Monné Oct. 21, 2024, 5 p.m. UTC | #24
On Mon, Oct 21, 2024 at 12:38:13PM +0100, Andrew Cooper wrote:
> On 21/10/2024 12:10 pm, Andrew Cooper wrote:
> > On 18/10/2024 9:08 am, Roger Pau Monne wrote:
> >> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
> >> repurposed to contain part of the offset into the remapping table.  Previous to
> >> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
> >> table would match the vector.  Such logic was mandatory for end of interrupt to
> >> work, since the vector field (even when not containing a vector) is used by the
> >> IO-APIC to find for which pin the EOI must be performed.
> >>
> >> Introduce a table to store the EOI handlers when using interrupt remapping, so
> >> that the IO-APIC driver can translate pins into EOI handlers without having to
> >> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
> >> unconditionally when interrupt remapping is enabled, even if strictly it would
> >> only be required for AMD-Vi.
> >>
> >> Reported-by: Willi Junga <xenproject@ymy.be>
> >> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
> >> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
> >> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > Yet more fallout from the multi-MSI work.  That really has been a giant
> > source of bugs.
> >
> >> ---
> >>  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 47 insertions(+)
> >>
> >> diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
> >> index e40d2f7dbd75..8856eb29d275 100644
> >> --- a/xen/arch/x86/io_apic.c
> >> +++ b/xen/arch/x86/io_apic.c
> >> @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
> >>  
> >>  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
> >>  
> >> +/*
> >> + * Store the EOI handle when using interrupt remapping.
> >> + *
> >> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
> >> + * format repurposes the vector field to store the offset into the Interrupt
> >> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
> >> + * longer matches the contents of the RTE vector field.  Add a translation
> >> + * table so that directed EOI uses the value in the RTE vector field when
> >> + * interrupt remapping is enabled.
> >> + *
> >> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
> >> + * when using the remapped format, but use the translation table uniformly in
> >> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
> >> + */
> >> +static unsigned int **apic_pin_eoi;
> > I think we can get away with this being uint8_t rather than unsigned
> > int, especially as we're allocating memory when not strictly necessary.
> >
> > The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.
> >
> > Vector 0xff is strictly SPIV and not allocated for anything else, so can
> > be reused as a suitable sentinel here.
> 
> Actually, vectors 0 thru 0x0f are also strictly invalid, and could be
> used as sentinels.  That's probably better than trying to play integer
> promotion games between IRQ_VECTOR_UNASSIGNED and uint8_t.

I've been giving some thought about this further, and I don't think
the above is accurate.  While vectors 0 thru 0x0f are strictly
invalid, the EOI handle in AMD-Vi is not a vector, but an offset into
the IR table.  Hence the range of valid handles is 0 to 0xff.

So the type of apic_pin_eoi needs to account for 0 to 0xff plus one
sentinel.  We could use uint16_t or int16_t, but at that point it
might be better to just use unsigned int?

Thanks, Roger.
Andrew Cooper Oct. 21, 2024, 5:21 p.m. UTC | #25
On 21/10/2024 6:00 pm, Roger Pau Monné wrote:
> On Mon, Oct 21, 2024 at 12:38:13PM +0100, Andrew Cooper wrote:
>> On 21/10/2024 12:10 pm, Andrew Cooper wrote:
>>> On 18/10/2024 9:08 am, Roger Pau Monne wrote:
>>>> When using AMD-VI interrupt remapping the vector field in the IO-APIC RTE is
>>>> repurposed to contain part of the offset into the remapping table.  Previous to
>>>> 2ca9fbd739b8 Xen had logic so that the offset into the interrupt remapping
>>>> table would match the vector.  Such logic was mandatory for end of interrupt to
>>>> work, since the vector field (even when not containing a vector) is used by the
>>>> IO-APIC to find for which pin the EOI must be performed.
>>>>
>>>> Introduce a table to store the EOI handlers when using interrupt remapping, so
>>>> that the IO-APIC driver can translate pins into EOI handlers without having to
>>>> read the IO-APIC RTE entry.  Note that to simplify the logic such table is used
>>>> unconditionally when interrupt remapping is enabled, even if strictly it would
>>>> only be required for AMD-Vi.
>>>>
>>>> Reported-by: Willi Junga <xenproject@ymy.be>
>>>> Suggested-by: David Woodhouse <dwmw@amazon.co.uk>
>>>> Fixes: 2ca9fbd739b8 ('AMD IOMMU: allocate IRTE entries instead of using a static mapping')
>>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>>> Yet more fallout from the multi-MSI work.  That really has been a giant
>>> source of bugs.
>>>
>>>> ---
>>>>  xen/arch/x86/io_apic.c | 47 ++++++++++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 47 insertions(+)
>>>>
>>>> diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
>>>> index e40d2f7dbd75..8856eb29d275 100644
>>>> --- a/xen/arch/x86/io_apic.c
>>>> +++ b/xen/arch/x86/io_apic.c
>>>> @@ -71,6 +71,22 @@ static int apic_pin_2_gsi_irq(int apic, int pin);
>>>>  
>>>>  static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
>>>>  
>>>> +/*
>>>> + * Store the EOI handle when using interrupt remapping.
>>>> + *
>>>> + * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
>>>> + * format repurposes the vector field to store the offset into the Interrupt
>>>> + * Remap table.  This causes directed EOI to longer work, as the CPU vector no
>>>> + * longer matches the contents of the RTE vector field.  Add a translation
>>>> + * table so that directed EOI uses the value in the RTE vector field when
>>>> + * interrupt remapping is enabled.
>>>> + *
>>>> + * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
>>>> + * when using the remapped format, but use the translation table uniformly in
>>>> + * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
>>>> + */
>>>> +static unsigned int **apic_pin_eoi;
>>> I think we can get away with this being uint8_t rather than unsigned
>>> int, especially as we're allocating memory when not strictly necessary.
>>>
>>> The only sentinel value we use is IRQ_VECTOR_UNASSIGNED which is -1.
>>>
>>> Vector 0xff is strictly SPIV and not allocated for anything else, so can
>>> be reused as a suitable sentinel here.
>> Actually, vectors 0 thru 0x0f are also strictly invalid, and could be
>> used as sentinels.  That's probably better than trying to play integer
>> promotion games between IRQ_VECTOR_UNASSIGNED and uint8_t.
> I've been giving some thought about this further, and I don't think
> the above is accurate.  While vectors 0 thru 0x0f are strictly
> invalid, the EOI handle in AMD-Vi is not a vector, but an offset into
> the IR table.  Hence the range of valid handles is 0 to 0xff.

Yeah, that occurred to me after sending.  With IR active, it's no longer
an architectural vector, and can have any 8-bit value.

> So the type of apic_pin_eoi needs to account for 0 to 0xff plus one
> sentinel.  We could use uint16_t or int16_t, but at that point it
> might be better to just use unsigned int?

Either of those are still half the allocated memory vs unsigned int, so
worth it IMO.

~Andrew
diff mbox series

Patch

diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index e40d2f7dbd75..8856eb29d275 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -71,6 +71,22 @@  static int apic_pin_2_gsi_irq(int apic, int pin);
 
 static vmask_t *__read_mostly vector_map[MAX_IO_APICS];
 
+/*
+ * Store the EOI handle when using interrupt remapping.
+ *
+ * If using AMD-Vi interrupt remapping the IO-APIC redirection entry remapped
+ * format repurposes the vector field to store the offset into the Interrupt
+ * Remap table.  This causes directed EOI to longer work, as the CPU vector no
+ * longer matches the contents of the RTE vector field.  Add a translation
+ * table so that directed EOI uses the value in the RTE vector field when
+ * interrupt remapping is enabled.
+ *
+ * Note Intel VT-d Xen code still stores the CPU vector in the RTE vector field
+ * when using the remapped format, but use the translation table uniformly in
+ * order to avoid extra logic to differentiate between VT-d and AMD-Vi.
+ */
+static unsigned int **apic_pin_eoi;
+
 static void share_vector_maps(unsigned int src, unsigned int dst)
 {
     unsigned int pin;
@@ -273,6 +289,13 @@  void __ioapic_write_entry(
     {
         __io_apic_write(apic, 0x11 + 2 * pin, eu.w2);
         __io_apic_write(apic, 0x10 + 2 * pin, eu.w1);
+        /*
+         * Might be called before apic_pin_eoi is allocated.  Entry will be
+         * updated once the array is allocated and there's an EOI or write
+         * against the pin.
+         */
+        if ( apic_pin_eoi )
+            apic_pin_eoi[apic][pin] = e.vector;
     }
     else
         iommu_update_ire_from_apic(apic, pin, e.raw);
@@ -298,9 +321,17 @@  static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int p
     /* Prefer the use of the EOI register if available */
     if ( ioapic_has_eoi_reg(apic) )
     {
+        if ( apic_pin_eoi )
+            vector = apic_pin_eoi[apic][pin];
+
         /* If vector is unknown, read it from the IO-APIC */
         if ( vector == IRQ_VECTOR_UNASSIGNED )
+        {
             vector = __ioapic_read_entry(apic, pin, true).vector;
+            if ( apic_pin_eoi )
+                /* Update cached value so further EOI don't need to fetch it. */
+                apic_pin_eoi[apic][pin] = vector;
+        }
 
         *(IO_APIC_BASE(apic)+16) = vector;
     }
@@ -1022,7 +1053,23 @@  static void __init setup_IO_APIC_irqs(void)
 
     apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
+    if ( iommu_intremap )
+    {
+        apic_pin_eoi = xzalloc_array(typeof(*apic_pin_eoi), nr_ioapics);
+        BUG_ON(!apic_pin_eoi);
+    }
+
     for (apic = 0; apic < nr_ioapics; apic++) {
+        if ( iommu_intremap )
+        {
+            apic_pin_eoi[apic] = xmalloc_array(typeof(**apic_pin_eoi),
+                                               nr_ioapic_entries[apic]);
+            BUG_ON(!apic_pin_eoi[apic]);
+
+            for ( pin = 0; pin < nr_ioapic_entries[apic]; pin++ )
+                apic_pin_eoi[apic][pin] = IRQ_VECTOR_UNASSIGNED;
+        }
+
         for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) {
             /*
              * add it to the IO-APIC irq-routing table: